diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index 4fd88ea81c84a..91e719c52d436 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -44,6 +44,8 @@ pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt
 # see https://github.com/llvm/llvm-project/pull/82393 and
 # https://discourse.llvm.org/t/rfc-future-of-windows-pre-commit-ci/76840/40
 # for further information.
+# We limit the number of parallel compile jobs to 16 to control memory
+# consumption and improve build reliability.
 cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
       -D LLVM_ENABLE_PROJECTS="${projects}" \
       -G Ninja \
@@ -58,7 +60,9 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
       -D MLIR_ENABLE_BINDINGS_PYTHON=ON \
       -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \
       -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \
-      -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO"
+      -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \
+      -D LLVM_PARALLEL_COMPILE_JOBS=16 \
+      -D LLVM_PARALLEL_LINK_JOBS=4
 
 echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
diff --git a/.github/workflows/restart-preempted-libcxx-jobs.yaml b/.github/workflows/restart-preempted-libcxx-jobs.yaml
new file mode 100644
index 0000000000000..71e27ff2abb9f
--- /dev/null
+++ b/.github/workflows/restart-preempted-libcxx-jobs.yaml
@@ -0,0 +1,105 @@
+name: Restart Preempted Libc++ Workflow
+
+# The libc++ builders run on preemptable VMs, which can be shut down at any time.
+# This workflow identifies when a workflow run was canceled due to the VM being preempted,
+# and restarts the workflow run.
+
+# We identify a canceled workflow run by checking the annotations of the check runs in the check suite,
+# which should contain the message "The runner has received a shutdown signal."
+
+# Note: If a job is both preempted and also contains a non-preemption failure, we do not restart the workflow.
+
+on:
+  workflow_run:
+    workflows: [Build and Test libc\+\+]
+    types:
+      - completed
+
+permissions:
+  contents: read
+
+jobs:
+  restart:
+    if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure' || github.event.workflow_run.conclusion == 'cancelled')
+    name: "Restart Job"
+    permissions:
+      statuses: read
+      checks: read
+      actions: write
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Restart Job"
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
+        with:
+          script: |
+            const failure_regex = /Process completed with exit code 1./
+            const preemption_regex = /The runner has received a shutdown signal/
+            
+            console.log('Listing check runs for suite')
+            const check_suites = await github.rest.checks.listForSuite({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              check_suite_id: context.payload.workflow_run.check_suite_id
+            })
+
+            check_run_ids = [];
+            for (check_run of check_suites.data.check_runs) {
+              console.log('Checking check run: ' + check_run.id);
+              if (check_run.status != 'completed') {
+                console.log('Check run was not completed. Skipping.');
+                continue;
+              }
+              if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') {
+                console.log('Check run had conclusion: ' + check_run.conclusion + '. Skipping.');
+                continue;
+              }
+              check_run_ids.push(check_run.id);
+            }
+            
+            has_preempted_job = false;
+
+            for (check_run_id of check_run_ids) {
+              console.log('Listing annotations for check run: ' + check_run_id);
+                 
+              annotations = await github.rest.checks.listAnnotations({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                check_run_id: check_run_id
+              })
+              
+              for (annotation of annotations.data) {
+                if (annotation.annotation_level != 'failure') {
+                  continue;
+                }
+                
+                const preemption_match = annotation.message.match(preemption_regex);
+              
+                if (preemption_match != null) {
+                  console.log('Found preemption message: ' + annotation.message);
+                  has_preempted_job = true;
+                }
+                
+                const failure_match = annotation.message.match(failure_regex);
+                if (failure_match != null) {
+                  // We only want to restart the workflow if all of the failures were due to preemption.
+                  // We don't want to restart the workflow if there were other failures.
+                  console.log('Choosing not to rerun workflow because we found a non-preemption failure');
+                  console.log('Failure message: ' + annotation.message);
+                  return;
+                }
+              }
+            } 
+             
+            if (!has_preempted_job) {
+              console.log('No preempted jobs found. Not restarting workflow.');
+              return;
+            }
+            
+            console.log("Restarted workflow: " + context.payload.workflow_run.id);
+            await github.rest.actions.reRunWorkflowFailedJobs({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                run_id: context.payload.workflow_run.id
+              })
+            
+        
diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
index c3208392df156..828f13805a698 100644
--- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp
@@ -1414,13 +1414,21 @@ IdentifierNamingCheck::getDiagInfo(const NamingCheckId &ID,
                   }};
 }
 
+StringRef IdentifierNamingCheck::getRealFileName(StringRef FileName) const {
+  auto Iter = RealFileNameCache.try_emplace(FileName); // Memoized per name.
+  SmallString<256U> &RealFileName = Iter.first->getValue();
+  if (!Iter.second) // Entry already existed: reuse the cached path.
+    return RealFileName;
+  llvm::sys::fs::real_path(FileName, RealFileName); // Error code is ignored.
+  return RealFileName;
+}
+
 const IdentifierNamingCheck::FileStyle &
 IdentifierNamingCheck::getStyleForFile(StringRef FileName) const {
   if (!GetConfigPerFile)
     return *MainFileStyle;
 
-  SmallString<128> RealFileName;
-  llvm::sys::fs::real_path(FileName, RealFileName);
+  StringRef RealFileName = getRealFileName(FileName);
   StringRef Parent = llvm::sys::path::parent_path(RealFileName);
   auto Iter = NamingStylesCache.find(Parent);
   if (Iter != NamingStylesCache.end())
diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h
index 27c8e4bc768c4..646ec0eac8dd1 100644
--- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h
+++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h
@@ -205,6 +205,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck {
                        const NamingCheckFailure &Failure) const override;
 
   const FileStyle &getStyleForFile(StringRef FileName) const;
+  StringRef getRealFileName(StringRef FileName) const;
 
   /// Find the style kind of a field in an anonymous record.
   StyleKind findStyleKindForAnonField(
@@ -222,6 +223,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck {
   /// Stores the style options as a vector, indexed by the specified \ref
   /// StyleKind, for a given directory.
   mutable llvm::StringMap<FileStyle> NamingStylesCache;
+  mutable llvm::StringMap<SmallString<256U>> RealFileNameCache;
   FileStyle *MainFileStyle;
   ClangTidyContext *Context;
   const bool GetConfigPerFile;
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst
index 44d97f7b363bf..271970c292c8f 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-length.rst
@@ -28,10 +28,7 @@ The following options are described below:
 
     .. code-block:: c++
 
-         int doubler(int x)   // warns that x is too short
-         {
-            return 2 * x;
-         }
+      int i = 42;    // warns that 'i' is too short
 
     This check does not have any fix suggestions in the general case since
     variable names have semantic value.
@@ -50,7 +47,10 @@ The following options are described below:
 
     .. code-block:: c++
 
-      int i = 42;    // warns that 'i' is too short
+         int doubler(int x)   // warns that x is too short
+         {
+            return 2 * x;
+         }
 
     This check does not have any fix suggestions in the general case since
     variable names have semantic value.
diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst
index b3e2b870ae5f9..3d21e37784b36 100644
--- a/clang/docs/InternalsManual.rst
+++ b/clang/docs/InternalsManual.rst
@@ -123,6 +123,44 @@ severe that error recovery won't be able to recover sensibly from them (thus
 spewing a ton of bogus errors).  One example of this class of error are failure
 to ``#include`` a file.
 
+Diagnostic Wording
+^^^^^^^^^^^^^^^^^^
+The wording used for a diagnostic is critical because it is the only way for a
+user to know how to correct their code. Use the following suggestions when
+wording a diagnostic.
+
+* Diagnostics in Clang do not start with a capital letter and do not end with
+  punctuation.
+
+    * This does not apply to proper nouns like ``Clang`` or ``OpenMP``, to
+      acronyms like ``GCC`` or ``ARC``, or to language standards like ``C23``
+      or ``C++17``.
+    * A trailing question mark is allowed. e.g., ``unknown identifier %0; did
+      you mean %1?``.
+
+* Appropriately capitalize proper nouns like ``Clang``, ``OpenCL``, ``GCC``,
+  ``Objective-C``, etc and language standard versions like ``C11`` or ``C++11``.
+* The wording should be succinct. If necessary, use a semicolon to combine
+  sentence fragments instead of using complete sentences. e.g., prefer wording
+  like ``'%0' is deprecated; it will be removed in a future release of Clang``
+  over wording like ``'%0' is deprecated. It will be removed in a future release
+  of Clang``.
+* The wording should be actionable and avoid using standards terms or grammar
+  productions that a new user would not be familiar with. e.g., prefer wording
+  like ``missing semicolon`` over wording like ``syntax error`` (which is not
+  actionable) or ``expected unqualified-id`` (which uses standards terminology).
+* The wording should clearly explain what is wrong with the code rather than
+  restating what the code does. e.g., prefer wording like ``type %0 requires a
+  value in the range %1 to %2`` over wording like ``%0 is invalid``.
+* The wording should have enough contextual information to help the user
+  identify the issue in a complex expression. e.g., prefer wording like
+  ``both sides of the %0 binary operator are identical`` over wording like
+  ``identical operands to binary operator``.
+* Use single quotes to denote syntactic constructs or command line arguments
+  named in a diagnostic message. e.g., prefer wording like ``'this' pointer
+  cannot be null in well-defined C++ code`` over wording like ``this pointer
+  cannot be null in well-defined C++ code``.
+
 The Format String
 ^^^^^^^^^^^^^^^^^
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 182f8b5824258..bd92818f0c09d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -541,6 +541,9 @@ Improvements to Clang's diagnostics
 - Clang emits a ``-Wparentheses`` warning for expressions with consecutive comparisons like ``x < y < z``.
   Fixes #GH20456.
 
+- Clang no longer emits a "declared here" note for a builtin function that has no declaration in source.
+  Fixes #GH93369.
+
 Improvements to Clang's time-trace
 ----------------------------------
 
@@ -629,6 +632,9 @@ Bug Fixes in This Version
 - ``__is_array`` and ``__is_bounded_array`` no longer return ``true`` for
   zero-sized arrays. Fixes (#GH54705).
 
+- Correctly reject declarations where a statement is required in C.
+  Fixes #GH92775.
+
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -802,6 +808,9 @@ Bug Fixes to C++ Support
 - Fixed a regression introduced in Clang 18 causing a static function overloading a non-static function
   with the same parameters not to be diagnosed. (Fixes #GH93456).
 - Clang now diagnoses unexpanded parameter packs in attributes. (Fixes #GH93269).
+- Clang now allows ``@$``` in raw string literals. Fixes (#GH93130).
+- Fix an assertion failure when checking invalid ``this`` usage in the wrong context. (Fixes #GH91536).
+- Clang no longer models dependent NTTP arguments as ``TemplateParamObjectDecl``\ s. Fixes (#GH84052).
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h
index b706864798baa..04daf511f5871 100644
--- a/clang/include/clang/AST/StmtOpenACC.h
+++ b/clang/include/clang/AST/StmtOpenACC.h
@@ -31,6 +31,8 @@ class OpenACCConstructStmt : public Stmt {
   /// The location of the directive statement, from the '#' to the last token of
   /// the directive.
   SourceRange Range;
+  /// The location of the directive name.
+  SourceLocation DirectiveLoc;
 
   /// The list of clauses.  This is stored here as an ArrayRef, as this is the
   /// most convienient place to access the list, however the list itself should
@@ -39,8 +41,9 @@ class OpenACCConstructStmt : public Stmt {
 
 protected:
   OpenACCConstructStmt(StmtClass SC, OpenACCDirectiveKind K,
-                       SourceLocation Start, SourceLocation End)
-      : Stmt(SC), Kind(K), Range(Start, End) {}
+                       SourceLocation Start, SourceLocation DirectiveLoc,
+                       SourceLocation End)
+      : Stmt(SC), Kind(K), Range(Start, End), DirectiveLoc(DirectiveLoc) {}
 
   // Used only for initialization, the leaf class can initialize this to
   // trailing storage.
@@ -59,6 +62,7 @@ class OpenACCConstructStmt : public Stmt {
 
   SourceLocation getBeginLoc() const { return Range.getBegin(); }
   SourceLocation getEndLoc() const { return Range.getEnd(); }
+  SourceLocation getDirectiveLoc() const { return DirectiveLoc; }
   ArrayRef<const OpenACCClause *> clauses() const { return Clauses; }
 
   child_range children() {
@@ -81,9 +85,11 @@ class OpenACCAssociatedStmtConstruct : public OpenACCConstructStmt {
 
 protected:
   OpenACCAssociatedStmtConstruct(StmtClass SC, OpenACCDirectiveKind K,
-                                 SourceLocation Start, SourceLocation End,
-                                 Stmt *AssocStmt)
-      : OpenACCConstructStmt(SC, K, Start, End), AssociatedStmt(AssocStmt) {}
+                                 SourceLocation Start,
+                                 SourceLocation DirectiveLoc,
+                                 SourceLocation End, Stmt *AssocStmt)
+      : OpenACCConstructStmt(SC, K, Start, DirectiveLoc, End),
+        AssociatedStmt(AssocStmt) {}
 
   void setAssociatedStmt(Stmt *S) { AssociatedStmt = S; }
   Stmt *getAssociatedStmt() { return AssociatedStmt; }
@@ -126,10 +132,10 @@ class OpenACCComputeConstruct final
   friend class ASTStmtReader;
   friend class ASTContext;
   OpenACCComputeConstruct(unsigned NumClauses)
-      : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass,
-                                       OpenACCDirectiveKind::Invalid,
-                                       SourceLocation{}, SourceLocation{},
-                                       /*AssociatedStmt=*/nullptr) {
+      : OpenACCAssociatedStmtConstruct(
+            OpenACCComputeConstructClass, OpenACCDirectiveKind::Invalid,
+            SourceLocation{}, SourceLocation{}, SourceLocation{},
+            /*AssociatedStmt=*/nullptr) {
     // We cannot send the TrailingObjects storage to the base class (which holds
     // a reference to the data) until it is constructed, so we have to set it
     // separately here.
@@ -141,11 +147,11 @@ class OpenACCComputeConstruct final
   }
 
   OpenACCComputeConstruct(OpenACCDirectiveKind K, SourceLocation Start,
-                          SourceLocation End,
+                          SourceLocation DirectiveLoc, SourceLocation End,
                           ArrayRef<const OpenACCClause *> Clauses,
                           Stmt *StructuredBlock)
       : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, K, Start,
-                                       End, StructuredBlock) {
+                                       DirectiveLoc, End, StructuredBlock) {
     assert(isOpenACCComputeDirectiveKind(K) &&
            "Only parallel, serial, and kernels constructs should be "
            "represented by this type");
@@ -169,8 +175,8 @@ class OpenACCComputeConstruct final
                                               unsigned NumClauses);
   static OpenACCComputeConstruct *
   Create(const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc,
-         SourceLocation EndLoc, ArrayRef<const OpenACCClause *> Clauses,
-         Stmt *StructuredBlock);
+         SourceLocation DirectiveLoc, SourceLocation EndLoc,
+         ArrayRef<const OpenACCClause *> Clauses, Stmt *StructuredBlock);
 
   Stmt *getStructuredBlock() { return getAssociatedStmt(); }
   const Stmt *getStructuredBlock() const {
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index e59cccccdd369..ef9df1e9d8b4a 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2025,9 +2025,12 @@ def Convergent : InheritableAttr {
 def NoInline : DeclOrStmtAttr {
   let Spellings = [CustomKeyword<"__noinline__">, GCC<"noinline">,
                    CXX11<"clang", "noinline">, C23<"clang", "noinline">,
+                   CXX11<"msvc", "noinline">, C23<"msvc", "noinline">,
                    Declspec<"noinline">];
-  let Accessors = [Accessor<"isClangNoInline", [CXX11<"clang", "noinline">,
-                                                C23<"clang", "noinline">]>];
+  let Accessors = [Accessor<"isStmtNoInline", [CXX11<"clang", "noinline">,
+                                               C23<"clang", "noinline">,
+                                               CXX11<"msvc", "noinline">,
+                                               C23<"msvc", "noinline">]>];
   let Documentation = [NoInlineDocs];
   let Subjects = SubjectList<[Function, Stmt], WarnDiag,
                              "functions and statements">;
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index fd8c1b480d6da..4e48ff48b60f5 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -135,6 +135,10 @@ TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_min_f16x8, "V8hV8hV8h", "nc", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "half-precision")
 
 TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128")
diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h
index d807955311828..d71857e8e5dcc 100644
--- a/clang/include/clang/Basic/CharInfo.h
+++ b/clang/include/clang/Basic/CharInfo.h
@@ -28,8 +28,7 @@ namespace charinfo {
     CHAR_LOWER    = 0x0040,  // a-z
     CHAR_UNDER    = 0x0080,  // _
     CHAR_PERIOD   = 0x0100,  // .
-    CHAR_RAWDEL   = 0x0200,  // {}[]#<>%:;?*+-/^&|~!=,"'
-    CHAR_PUNCT    = 0x0400   // `$@()
+    CHAR_PUNCT    = 0x0200,  // {}[]#<>%:;?*+-/^&|~!=,"'`$@()
   };
 
   enum {
@@ -152,7 +151,7 @@ LLVM_READONLY inline bool isHexDigit(unsigned char c) {
 /// Note that '_' is both a punctuation character and an identifier character!
 LLVM_READONLY inline bool isPunctuation(unsigned char c) {
   using namespace charinfo;
-  return (InfoTable[c] & (CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL|CHAR_PUNCT)) != 0;
+  return (InfoTable[c] & (CHAR_UNDER | CHAR_PERIOD | CHAR_PUNCT)) != 0;
 }
 
 /// Return true if this character is an ASCII printable character; that is, a
@@ -160,8 +159,8 @@ LLVM_READONLY inline bool isPunctuation(unsigned char c) {
 /// terminal.
 LLVM_READONLY inline bool isPrintable(unsigned char c) {
   using namespace charinfo;
-  return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD|CHAR_PUNCT|
-                          CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL|CHAR_SPACE)) != 0;
+  return (InfoTable[c] & (CHAR_UPPER | CHAR_LOWER | CHAR_PERIOD | CHAR_PUNCT |
+                          CHAR_DIGIT | CHAR_UNDER | CHAR_SPACE)) != 0;
 }
 
 /// Return true if this is the body character of a C preprocessing number,
@@ -175,8 +174,9 @@ LLVM_READONLY inline bool isPreprocessingNumberBody(unsigned char c) {
 /// Return true if this is the body character of a C++ raw string delimiter.
 LLVM_READONLY inline bool isRawStringDelimBody(unsigned char c) {
   using namespace charinfo;
-  return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD|
-                          CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL)) != 0;
+  return (InfoTable[c] & (CHAR_UPPER | CHAR_LOWER | CHAR_PERIOD | CHAR_DIGIT |
+                          CHAR_UNDER | CHAR_PUNCT)) != 0 &&
+         c != '(' && c != ')';
 }
 
 enum class EscapeChar {
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 5a4551a96ca4e..25fbfe83fa2bc 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -111,6 +111,14 @@ def warn_cxx98_compat_raw_string_literal : Warning<
   "raw string literals are incompatible with C++98">,
   InGroup<CXX98Compat>, DefaultIgnore;
 
+def warn_cxx26_compat_raw_string_literal_character_set : Warning<
+  " '%0' in a raw string literal delimiter is incompatible "
+  "with standards before C++2c">,
+  InGroup<CXXPre26Compat>, DefaultIgnore;
+def ext_cxx26_raw_string_literal_character_set : Extension<
+  " '%0' in a raw string literal delimiter is a C++2c extension">,
+  InGroup<CXX26>, DefaultIgnore;
+
 def warn_multichar_character_literal : Warning<
   "multi-character character constant">, InGroup<MultiChar>;
 def warn_four_char_character_literal : Warning<
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 80e635e4a57ec..564a58e4eb670 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -146,6 +146,25 @@ let TargetGuard = "sme" in {
                              [IsOverloadNone, IsStreamingCompatible, IsOutZA]>;
 }
 
+let TargetGuard = "sme2p1" in {
+  def SVZERO_ZA64_VG1x2 : SInst<"svzero_za64_vg1x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg1x2",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG1x4 : SInst<"svzero_za64_vg1x4", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg1x4",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG2x1 : SInst<"svzero_za64_vg2x1", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg2x1",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG2x2 : SInst<"svzero_za64_vg2x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg2x2",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG2x4 : SInst<"svzero_za64_vg2x4", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg2x4",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG4x1 : SInst<"svzero_za64_vg4x1", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg4x1",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG4x2 : SInst<"svzero_za64_vg4x2", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg4x2",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+  def SVZERO_ZA64_VG4x4 : SInst<"svzero_za64_vg4x4", "vm", "", MergeNone, "aarch64_sme_zero_za64_vg4x4",
+                            [IsOverloadNone, IsStreaming, IsInOutZA]>;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // SME - Counting elements in a streaming vector
 
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 03570f94de666..88938a981fd8a 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2151,6 +2151,11 @@ let TargetGuard = "sme2" in {
   def SVFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]",  "44dd",   "hfd",      MergeNone, "aarch64_sve_fclamp_single_x4",  [IsStreaming], []>;
 }
 
+let TargetGuard = "sme2,b16b16"in {
+  def SVBFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]",  "22dd",   "b",      MergeNone, "aarch64_sve_bfclamp_single_x2",  [IsStreaming], []>;
+  def SVBFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]",  "44dd",   "b",      MergeNone, "aarch64_sve_bfclamp_single_x4",  [IsStreaming], []>;
+}
+
 let TargetGuard = "sme2" in {
 // == ADD (vectors) ==
   def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
@@ -2265,6 +2270,10 @@ let TargetGuard = "sme2" in {
   def SVCVT_S32_F32_X4 : SInst<"svcvt_{d}[_f32_x4]", "4.d4.M", "i",  MergeNone, "aarch64_sve_fcvtzs_x4", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>;
 }
 
+let TargetGuard = "sme-f16f16" in {
+  def SVCVT_F32_X2 : SInst<"svcvt_{d}[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvt_widen_x2", [ IsStreaming],[]>;
+}
+
 //
 // Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16
 //
@@ -2273,6 +2282,13 @@ let TargetGuard = "sme2" in {
   def SVCVTN_BF16_X2 : SInst<"svcvtn_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvtn_x2", [IsOverloadNone, IsStreaming],[]>;
 }
 
+//
+// Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
+//
+let TargetGuard = "sme-f16f16" in {
+  def SVCVTL_F32_X2 : SInst<"svcvtl_f32[_f16_x2]", "2h", "f", MergeNone, "aarch64_sve_fcvtl_widen_x2", [ IsStreaming],[]>;
+}
+
 //
 // Multi-vector saturating extract narrow
 //
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index de2f245fb29f8..4119e69c85540 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6277,11 +6277,9 @@ def mapx_features_EQ : CommaJoined<["-"], "mapx-features=">, Group<m_x86_Feature
     HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">;
 def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, Group<m_x86_Features_Group>,
     HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf">;
-// Features egpr, push2pop2, ppx and ndd are validated with llvm-test-suite && cpu2017 on Intel SDE.
-// For stability, we turn on these features only for -mapxf. After a feature pass the validation,
-// we will add it to -mapxf.
-def mapxf : Flag<["-"], "mapxf">, Alias<mapx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx", "ndd"]>;
-def mno_apxf : Flag<["-"], "mno-apxf">, Alias<mno_apx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd"]>;
+// For stability, we only add a feature to -mapxf after it passes the validation of llvm-test-suite && cpu2017 on Intel SDE.
+def mapxf : Flag<["-"], "mapxf">, Alias<mapx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf"]>;
+def mno_apxf : Flag<["-"], "mno-apxf">, Alias<mno_apx_features_EQ>, AliasArgs<["egpr","push2pop2","ppx","ndd","ccmp","nf"]>;
 } // let Flags = [TargetSpecific]
 
 // VE feature flags
diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h
index 8ccebe457ed53..76d7fd798bed3 100644
--- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h
+++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h
@@ -21,6 +21,7 @@
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/ParentMapContext.h"
 #include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/Basic/LLVM.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/Specifiers.h"
@@ -127,7 +128,7 @@ class ExtractAPIVisitorBase : public RecursiveASTVisitor<Derived> {
 protected:
   /// Collect API information for the enum constants and associate with the
   /// parent enum.
-  void recordEnumConstants(EnumRecord *EnumRecord,
+  void recordEnumConstants(SymbolReference Container,
                            const EnumDecl::enumerator_range Constants);
 
   /// Collect API information for the Objective-C methods and associate with the
@@ -248,12 +249,8 @@ class ExtractAPIVisitorBase : public RecursiveASTVisitor<Derived> {
     clang::index::generateUSRForDecl(Tag, TagUSR);
     if (auto *Record = llvm::dyn_cast_if_present<TagRecord>(
             API.findRecordForUSR(TagUSR))) {
-      if (Record->IsEmbeddedInVarDeclarator) {
+      if (Record->IsEmbeddedInVarDeclarator)
         NewRecordContext->stealRecordChain(*Record);
-        auto *NewRecord = cast<APIRecord>(NewRecordContext);
-        if (NewRecord->Comment.empty())
-          NewRecord->Comment = Record->Comment;
-      }
     }
   }
 };
@@ -394,17 +391,6 @@ bool ExtractAPIVisitorBase<Derived>::VisitEnumDecl(const EnumDecl *Decl) {
   if (!getDerivedExtractAPIVisitor().shouldDeclBeIncluded(Decl))
     return true;
 
-  SmallString<128> QualifiedNameBuffer;
-  // Collect symbol information.
-  StringRef Name = Decl->getName();
-  if (Name.empty())
-    Name = getTypedefName(Decl);
-  if (Name.empty()) {
-    llvm::raw_svector_ostream OS(QualifiedNameBuffer);
-    Decl->printQualifiedName(OS);
-    Name = QualifiedNameBuffer;
-  }
-
   SmallString<128> USR;
   index::generateUSRForDecl(Decl, USR);
   PresumedLoc Loc =
@@ -420,13 +406,29 @@ bool ExtractAPIVisitorBase<Derived>::VisitEnumDecl(const EnumDecl *Decl) {
       DeclarationFragmentsBuilder::getFragmentsForEnum(Decl);
   DeclarationFragments SubHeading =
       DeclarationFragmentsBuilder::getSubHeading(Decl);
-  auto *ER = API.createRecord<EnumRecord>(
-      USR, Name, createHierarchyInformationForDecl(*Decl), Loc,
-      AvailabilityInfo::createFromDecl(Decl), Comment, Declaration, SubHeading,
-      isInSystemHeader(Decl), isEmbeddedInVarDeclarator(*Decl));
+
+  // Collect symbol information.
+  SymbolReference ParentContainer;
+
+  if (Decl->hasNameForLinkage()) {
+    StringRef Name = Decl->getName();
+    if (Name.empty())
+      Name = getTypedefName(Decl);
+
+    auto *ER = API.createRecord<EnumRecord>(
+        USR, Name, createHierarchyInformationForDecl(*Decl), Loc,
+        AvailabilityInfo::createFromDecl(Decl), Comment, Declaration,
+        SubHeading, isInSystemHeader(Decl), false);
+    ParentContainer = SymbolReference(ER);
+  } else {
+    // If this is an anonymous enum then the parent scope of the constants is
+    // the top level namespace.
+    ParentContainer = {};
+  }
 
   // Now collect information about the enumerators in this enum.
-  getDerivedExtractAPIVisitor().recordEnumConstants(ER, Decl->enumerators());
+  getDerivedExtractAPIVisitor().recordEnumConstants(ParentContainer,
+                                                    Decl->enumerators());
 
   return true;
 }
@@ -1197,7 +1199,7 @@ bool ExtractAPIVisitorBase<Derived>::VisitObjCCategoryDecl(
 /// parent enum.
 template <typename Derived>
 void ExtractAPIVisitorBase<Derived>::recordEnumConstants(
-    EnumRecord *EnumRecord, const EnumDecl::enumerator_range Constants) {
+    SymbolReference Container, const EnumDecl::enumerator_range Constants) {
   for (const auto *Constant : Constants) {
     // Collect symbol information.
     StringRef Name = Constant->getName();
@@ -1218,9 +1220,8 @@ void ExtractAPIVisitorBase<Derived>::recordEnumConstants(
         DeclarationFragmentsBuilder::getSubHeading(Constant);
 
     API.createRecord<EnumConstantRecord>(
-        USR, Name, createHierarchyInformationForDecl(*Constant), Loc,
-        AvailabilityInfo::createFromDecl(Constant), Comment, Declaration,
-        SubHeading, isInSystemHeader(Constant));
+        USR, Name, Container, Loc, AvailabilityInfo::createFromDecl(Constant),
+        Comment, Declaration, SubHeading, isInSystemHeader(Constant));
   }
 }
 
@@ -1469,7 +1470,17 @@ class ExtractAPIVisitor
 
   bool shouldDeclBeIncluded(const Decl *D) const { return true; }
   const RawComment *fetchRawCommentForDecl(const Decl *D) const {
-    return this->Context.getRawCommentForDeclNoCache(D);
+    if (const auto *Comment = this->Context.getRawCommentForDeclNoCache(D))
+      return Comment;
+
+    if (const auto *Declarator = dyn_cast<DeclaratorDecl>(D)) {
+      const auto *TagTypeDecl = Declarator->getType()->getAsTagDecl();
+      if (TagTypeDecl && TagTypeDecl->isEmbeddedInDeclarator() &&
+          TagTypeDecl->isCompleteDefinition())
+        return this->Context.getRawCommentForDeclNoCache(TagTypeDecl);
+    }
+
+    return nullptr;
   }
 };
 
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 8493026f5f7a6..d054b8cf0d240 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -467,15 +467,18 @@ class Parser : public CodeCompletionHandler {
 
   /// Flags describing a context in which we're parsing a statement.
   enum class ParsedStmtContext {
+    /// This context permits declarations in language modes where declarations
+    /// are not statements.
+    AllowDeclarationsInC = 0x1,
     /// This context permits standalone OpenMP directives.
-    AllowStandaloneOpenMPDirectives = 0x1,
+    AllowStandaloneOpenMPDirectives = 0x2,
     /// This context is at the top level of a GNU statement expression.
-    InStmtExpr = 0x2,
+    InStmtExpr = 0x4,
 
     /// The context of a regular substatement.
     SubStmt = 0,
     /// The context of a compound-statement.
-    Compound = AllowStandaloneOpenMPDirectives,
+    Compound = AllowDeclarationsInC | AllowStandaloneOpenMPDirectives,
 
     LLVM_MARK_AS_BITMASK_ENUM(InStmtExpr)
   };
@@ -3656,6 +3659,7 @@ class Parser : public CodeCompletionHandler {
   struct OpenACCDirectiveParseInfo {
     OpenACCDirectiveKind DirKind;
     SourceLocation StartLoc;
+    SourceLocation DirLoc;
     SourceLocation EndLoc;
     SmallVector<OpenACCClause *> Clauses;
     // TODO OpenACC: As we implement support for the Atomic, Routine, Cache, and
diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h
index 6f69fa08939b8..66144de4340a8 100644
--- a/clang/include/clang/Sema/SemaOpenACC.h
+++ b/clang/include/clang/Sema/SemaOpenACC.h
@@ -379,7 +379,7 @@ class SemaOpenACC : public SemaBase {
   /// Called after the construct has been parsed, but clauses haven't been
   /// parsed.  This allows us to diagnose not-implemented, as well as set up any
   /// state required for parsing the clauses.
-  void ActOnConstruct(OpenACCDirectiveKind K, SourceLocation StartLoc);
+  void ActOnConstruct(OpenACCDirectiveKind K, SourceLocation DirLoc);
 
   /// Called after the directive, including its clauses, have been parsed and
   /// parsing has consumed the 'annot_pragma_openacc_end' token. This DOES
@@ -400,6 +400,7 @@ class SemaOpenACC : public SemaBase {
   /// declaration group or associated statement.
   StmtResult ActOnEndStmtDirective(OpenACCDirectiveKind K,
                                    SourceLocation StartLoc,
+                                   SourceLocation DirLoc,
                                    SourceLocation EndLoc,
                                    ArrayRef<OpenACCClause *> Clauses,
                                    StmtResult AssocStmt);
diff --git a/clang/lib/AST/APValue.cpp b/clang/lib/AST/APValue.cpp
index 8c77b563657d9..d8e33ff421c06 100644
--- a/clang/lib/AST/APValue.cpp
+++ b/clang/lib/AST/APValue.cpp
@@ -90,7 +90,7 @@ QualType APValue::LValueBase::getType() const {
   // For a materialized temporary, the type of the temporary we materialized
   // may not be the type of the expression.
   if (const MaterializeTemporaryExpr *MTE =
-          clang::dyn_cast<MaterializeTemporaryExpr>(Base)) {
+          llvm::dyn_cast<MaterializeTemporaryExpr>(Base)) {
     SmallVector<const Expr *, 2> CommaLHSs;
     SmallVector<SubobjectAdjustment, 2> Adjustments;
     const Expr *Temp = MTE->getSubExpr();
diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp
index a381a8dd7b62c..47899b344c97a 100644
--- a/clang/lib/AST/StmtOpenACC.cpp
+++ b/clang/lib/AST/StmtOpenACC.cpp
@@ -23,15 +23,14 @@ OpenACCComputeConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) {
   return Inst;
 }
 
-OpenACCComputeConstruct *
-OpenACCComputeConstruct::Create(const ASTContext &C, OpenACCDirectiveKind K,
-                                SourceLocation BeginLoc, SourceLocation EndLoc,
-                                ArrayRef<const OpenACCClause *> Clauses,
-                                Stmt *StructuredBlock) {
+OpenACCComputeConstruct *OpenACCComputeConstruct::Create(
+    const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc,
+    SourceLocation DirLoc, SourceLocation EndLoc,
+    ArrayRef<const OpenACCClause *> Clauses, Stmt *StructuredBlock) {
   void *Mem = C.Allocate(
       OpenACCComputeConstruct::totalSizeToAlloc<const OpenACCClause *>(
           Clauses.size()));
-  auto *Inst = new (Mem)
-      OpenACCComputeConstruct(K, BeginLoc, EndLoc, Clauses, StructuredBlock);
+  auto *Inst = new (Mem) OpenACCComputeConstruct(K, BeginLoc, DirLoc, EndLoc,
+                                                 Clauses, StructuredBlock);
   return Inst;
 }
diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp
index a7ee973b7f7d0..b50daf5fbed6a 100644
--- a/clang/lib/AST/TemplateBase.cpp
+++ b/clang/lib/AST/TemplateBase.cpp
@@ -221,8 +221,13 @@ static const ValueDecl *getAsSimpleValueDeclRef(const ASTContext &Ctx,
 
   // We model class non-type template parameters as their template parameter
   // object declaration.
-  if (V.isStruct() || V.isUnion())
+  if (V.isStruct() || V.isUnion()) {
+    // Dependent types are not supposed to be described as
+    // TemplateParamObjectDecls.
+    if (T->isDependentType() || T->isInstantiationDependentType())
+      return nullptr;
     return Ctx.getTemplateParamObjectDecl(T, V);
+  }
 
   // Pointers and references with an empty path use the special 'Declaration'
   // representation.
diff --git a/clang/lib/Analysis/MacroExpansionContext.cpp b/clang/lib/Analysis/MacroExpansionContext.cpp
index 564e359668a51..b212b7f245792 100644
--- a/clang/lib/Analysis/MacroExpansionContext.cpp
+++ b/clang/lib/Analysis/MacroExpansionContext.cpp
@@ -12,7 +12,7 @@
 
 #define DEBUG_TYPE "macro-expansion-context"
 
-static void dumpTokenInto(const clang::Preprocessor &PP, clang::raw_ostream &OS,
+static void dumpTokenInto(const clang::Preprocessor &PP, llvm::raw_ostream &OS,
                           clang::Token Tok);
 
 namespace clang {
diff --git a/clang/lib/Basic/CharInfo.cpp b/clang/lib/Basic/CharInfo.cpp
index d02054c9718f5..26d693b8e9b94 100644
--- a/clang/lib/Basic/CharInfo.cpp
+++ b/clang/lib/Basic/CharInfo.cpp
@@ -31,20 +31,20 @@ const uint16_t clang::charinfo::InfoTable[256] = {
   0           , 0           , 0           , 0           ,
   //32 SP         33  !         34  "         35  #
   //36  $         37  %         38  &         39  '
-  CHAR_SPACE  , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
-  CHAR_PUNCT  , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+  CHAR_SPACE  , CHAR_PUNCT  , CHAR_PUNCT  , CHAR_PUNCT  ,
+  CHAR_PUNCT  , CHAR_PUNCT  , CHAR_PUNCT  , CHAR_PUNCT  ,
   //40  (         41  )         42  *         43  +
   //44  ,         45  -         46  .         47  /
-  CHAR_PUNCT  , CHAR_PUNCT  , CHAR_RAWDEL , CHAR_RAWDEL ,
-  CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
+  CHAR_PUNCT  , CHAR_PUNCT  , CHAR_PUNCT  , CHAR_PUNCT  ,
+  CHAR_PUNCT  , CHAR_PUNCT  , CHAR_PERIOD , CHAR_PUNCT  ,
   //48  0         49  1         50  2         51  3
   //52  4         53  5         54  6         55  7
   CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  ,
   CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  ,
   //56  8         57  9         58  :         59  ;
   //60  <         61  =         62  >         63  ?
-  CHAR_DIGIT  , CHAR_DIGIT  , CHAR_RAWDEL , CHAR_RAWDEL ,
-  CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
+  CHAR_DIGIT  , CHAR_DIGIT  , CHAR_PUNCT  , CHAR_PUNCT  ,
+  CHAR_PUNCT  , CHAR_PUNCT  , CHAR_PUNCT  , CHAR_PUNCT  ,
   //64  @         65  A         66  B         67  C
   //68  D         69  E         70  F         71  G
   CHAR_PUNCT  , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER ,
@@ -59,8 +59,8 @@ const uint16_t clang::charinfo::InfoTable[256] = {
   CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  ,
   //88  X         89  Y         90  Z         91  [
   //92  \         93  ]         94  ^         95  _
-  CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_RAWDEL ,
-  CHAR_PUNCT  , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER  ,
+  CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_PUNCT  ,
+  CHAR_PUNCT  , CHAR_PUNCT  , CHAR_PUNCT  , CHAR_UNDER  ,
   //96  `         97  a         98  b         99  c
   //100  d       101  e        102  f        103  g
   CHAR_PUNCT  , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER ,
@@ -75,6 +75,6 @@ const uint16_t clang::charinfo::InfoTable[256] = {
   CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  ,
   //120  x       121  y        122  z        123  {
   //124  |       125  }        126  ~        127 DEL
-  CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_RAWDEL ,
-  CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
+  CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_PUNCT  ,
+  CHAR_PUNCT  , CHAR_PUNCT  , CHAR_PUNCT  , 0
 };
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 3a30cff917bb4..08e44360bfbe3 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -961,7 +961,7 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasCF)
     Builder.defineMacro("__CF__");
   // Condition here is aligned with the feature set of mapxf in Options.td
-  if (HasEGPR && HasPush2Pop2 && HasPPX && HasNDD)
+  if (HasEGPR && HasPush2Pop2 && HasPPX && HasNDD && HasCCMP && HasNF)
     Builder.defineMacro("__APX_F__");
 
   // Each case falls through to the previous one here.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 5edf8c7970913..266bf41fd5577 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14074,7 +14074,7 @@ Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {
   // Grab the appropriate field from __cpu_model.
   llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
                          ConstantInt::get(Int32Ty, Index)};
-  llvm::Value *CpuValue = Builder.CreateGEP(STy, CpuModel, Idxs);
+  llvm::Value *CpuValue = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs);
   CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue,
                                        CharUnits::fromQuantity(4));
 
@@ -14116,7 +14116,7 @@ CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
     // global in the struct STy.
     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3),
                      Builder.getInt32(0)};
-    Value *CpuFeatures = Builder.CreateGEP(STy, CpuModel, Idxs);
+    Value *CpuFeatures = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs);
     Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures,
                                                 CharUnits::fromQuantity(4));
 
@@ -14137,7 +14137,7 @@ CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
       continue;
     Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)};
     Value *Features = Builder.CreateAlignedLoad(
-        Int32Ty, Builder.CreateGEP(ATy, CpuFeatures2, Idxs),
+        Int32Ty, Builder.CreateInBoundsGEP(ATy, CpuFeatures2, Idxs),
         CharUnits::fromQuantity(4));
     // Check the value of the bit corresponding to the feature requested.
     Value *Mask = Builder.getInt32(M);
@@ -16724,7 +16724,7 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
       llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
                              ConstantInt::get(Int32Ty, FieldIdx)};
 
-      FieldValue = Builder.CreateGEP(STy, SysConf, Idxs);
+      FieldValue = Builder.CreateInBoundsGEP(STy, SysConf, Idxs);
       FieldValue = Builder.CreateAlignedLoad(Int32Ty, FieldValue,
                                              CharUnits::fromQuantity(4));
     } else if (SupportMethod == SYS_CALL) {
@@ -20806,6 +20806,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
   }
   case WebAssembly::BI__builtin_wasm_min_f32:
   case WebAssembly::BI__builtin_wasm_min_f64:
+  case WebAssembly::BI__builtin_wasm_min_f16x8:
   case WebAssembly::BI__builtin_wasm_min_f32x4:
   case WebAssembly::BI__builtin_wasm_min_f64x2: {
     Value *LHS = EmitScalarExpr(E->getArg(0));
@@ -20816,6 +20817,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
   }
   case WebAssembly::BI__builtin_wasm_max_f32:
   case WebAssembly::BI__builtin_wasm_max_f64:
+  case WebAssembly::BI__builtin_wasm_max_f16x8:
   case WebAssembly::BI__builtin_wasm_max_f32x4:
   case WebAssembly::BI__builtin_wasm_max_f64x2: {
     Value *LHS = EmitScalarExpr(E->getArg(0));
@@ -20824,6 +20826,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType()));
     return Builder.CreateCall(Callee, {LHS, RHS});
   }
+  case WebAssembly::BI__builtin_wasm_pmin_f16x8:
   case WebAssembly::BI__builtin_wasm_pmin_f32x4:
   case WebAssembly::BI__builtin_wasm_pmin_f64x2: {
     Value *LHS = EmitScalarExpr(E->getArg(0));
@@ -20832,6 +20835,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType()));
     return Builder.CreateCall(Callee, {LHS, RHS});
   }
+  case WebAssembly::BI__builtin_wasm_pmax_f16x8:
   case WebAssembly::BI__builtin_wasm_pmax_f32x4:
   case WebAssembly::BI__builtin_wasm_pmax_f64x2: {
     Value *LHS = EmitScalarExpr(E->getArg(0));
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index bba00257fd4f0..7a92fc3dfb4a4 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -1789,7 +1789,6 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
     // Push a destructor if necessary.
     // FIXME: if we have an array of structures, all explicitly
     // initialized, we can end up pushing a linear number of cleanups.
-    bool pushedCleanup = false;
     if (QualType::DestructionKind dtorKind
           = field->getType().isDestructedType()) {
       assert(LV.isSimple());
@@ -1797,17 +1796,8 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
         CGF.pushDestroyAndDeferDeactivation(NormalAndEHCleanup, LV.getAddress(),
                                             field->getType(),
                                             CGF.getDestroyer(dtorKind), false);
-        pushedCleanup = true;
       }
     }
-
-    // If the GEP didn't get used because of a dead zero init or something
-    // else, clean it up for -O0 builds and general tidiness.
-    if (!pushedCleanup && LV.isSimple())
-      if (llvm::GetElementPtrInst *GEP =
-              dyn_cast<llvm::GetElementPtrInst>(LV.emitRawPointer(CGF)))
-        if (GEP->use_empty())
-          GEP->eraseFromParent();
   }
 }
 
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e4774a587707a..0b0b659e1fd49 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -5341,6 +5341,18 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
       !IsDefinitionAvailableExternally &&
       D->needsDestruction(getContext()) == QualType::DK_cxx_destructor;
 
+  // It is pointless to emit the definition for an available_externally
+  // variable which can't be marked as const.
+  // We don't need to check whether it needs a global ctor or dtor. See the
+  // comment above for details.
+  if (IsDefinitionAvailableExternally &&
+      (!D->hasConstantInitialization() ||
+       // TODO: Update this when we have an interface to check for a constexpr
+       // destructor.
+       D->needsDestruction(getContext()) ||
+       !D->getType().isConstantStorage(getContext(), true, true)))
+    return;
+
   const VarDecl *InitDecl;
   const Expr *InitExpr = D->getAnyInitializer(InitDecl);
 
diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 76704c4d7be4a..db8e6f55302ad 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -1340,7 +1340,7 @@ void CodeGenPGO::setProfileVersion(llvm::Module &M) {
                                         llvm::APInt(64, ProfileVersion)),
         VarName);
 
-    IRLevelVersionVariable->setVisibility(llvm::GlobalValue::DefaultVisibility);
+    IRLevelVersionVariable->setVisibility(llvm::GlobalValue::HiddenVisibility);
     llvm::Triple TT(M.getTargetTriple());
     if (TT.supportsCOMDAT()) {
       IRLevelVersionVariable->setLinkage(llvm::GlobalValue::ExternalLinkage);
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 9849c59685cca..b141e5f2adfab 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2227,10 +2227,19 @@ void Generic_GCC::GCCInstallationDetector::init(
   SmallVector<StringRef, 16> CandidateBiarchTripleAliases;
   // Add some triples that we want to check first.
   CandidateTripleAliases.push_back(TargetTriple.str());
-  std::string TripleNoVendor = TargetTriple.getArchName().str() + "-" +
-                               TargetTriple.getOSAndEnvironmentName().str();
-  if (TargetTriple.getVendor() == llvm::Triple::UnknownVendor)
+  std::string TripleNoVendor, BiarchTripleNoVendor;
+  if (TargetTriple.getVendor() == llvm::Triple::UnknownVendor) {
+    StringRef OSEnv = TargetTriple.getOSAndEnvironmentName();
+    if (TargetTriple.getEnvironment() == llvm::Triple::GNUX32)
+      OSEnv = "linux-gnu";
+    TripleNoVendor = (TargetTriple.getArchName().str() + '-' + OSEnv).str();
     CandidateTripleAliases.push_back(TripleNoVendor);
+    if (BiarchVariantTriple.getArch() != llvm::Triple::UnknownArch) {
+      BiarchTripleNoVendor =
+          (BiarchVariantTriple.getArchName().str() + '-' + OSEnv).str();
+      CandidateBiarchTripleAliases.push_back(BiarchTripleNoVendor);
+    }
+  }
 
   CollectLibDirsAndTriples(TargetTriple, BiarchVariantTriple, CandidateLibDirs,
                            CandidateTripleAliases, CandidateBiarchLibDirs,
@@ -2453,11 +2462,9 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes(
   // lists should shrink over time. Please don't add more elements to *Triples.
   static const char *const AArch64LibDirs[] = {"/lib64", "/lib"};
   static const char *const AArch64Triples[] = {
-      "aarch64-none-linux-gnu", "aarch64-linux-gnu", "aarch64-redhat-linux",
-      "aarch64-suse-linux"};
+      "aarch64-none-linux-gnu", "aarch64-redhat-linux", "aarch64-suse-linux"};
   static const char *const AArch64beLibDirs[] = {"/lib"};
-  static const char *const AArch64beTriples[] = {"aarch64_be-none-linux-gnu",
-                                                 "aarch64_be-linux-gnu"};
+  static const char *const AArch64beTriples[] = {"aarch64_be-none-linux-gnu"};
 
   static const char *const ARMLibDirs[] = {"/lib"};
   static const char *const ARMTriples[] = {"arm-linux-gnueabi"};
@@ -2482,9 +2489,8 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes(
       "x86_64-linux-gnu",       "x86_64-unknown-linux-gnu",
       "x86_64-pc-linux-gnu",    "x86_64-redhat-linux6E",
       "x86_64-redhat-linux",    "x86_64-suse-linux",
-      "x86_64-manbo-linux-gnu", "x86_64-linux-gnu",
-      "x86_64-slackware-linux", "x86_64-unknown-linux",
-      "x86_64-amazon-linux"};
+      "x86_64-manbo-linux-gnu", "x86_64-slackware-linux",
+      "x86_64-unknown-linux",   "x86_64-amazon-linux"};
   static const char *const X32Triples[] = {"x86_64-linux-gnux32",
                                            "x86_64-pc-linux-gnux32"};
   static const char *const X32LibDirs[] = {"/libx32", "/lib"};
@@ -2500,26 +2506,24 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes(
       "loongarch64-linux-gnu", "loongarch64-unknown-linux-gnu"};
 
   static const char *const M68kLibDirs[] = {"/lib"};
-  static const char *const M68kTriples[] = {
-      "m68k-linux-gnu", "m68k-unknown-linux-gnu", "m68k-suse-linux"};
+  static const char *const M68kTriples[] = {"m68k-unknown-linux-gnu",
+                                            "m68k-suse-linux"};
 
   static const char *const MIPSLibDirs[] = {"/libo32", "/lib"};
   static const char *const MIPSTriples[] = {
       "mips-linux-gnu", "mips-mti-linux", "mips-mti-linux-gnu",
       "mips-img-linux-gnu", "mipsisa32r6-linux-gnu"};
   static const char *const MIPSELLibDirs[] = {"/libo32", "/lib"};
-  static const char *const MIPSELTriples[] = {
-      "mipsel-linux-gnu", "mips-img-linux-gnu", "mipsisa32r6el-linux-gnu"};
+  static const char *const MIPSELTriples[] = {"mipsel-linux-gnu",
+                                              "mips-img-linux-gnu"};
 
   static const char *const MIPS64LibDirs[] = {"/lib64", "/lib"};
   static const char *const MIPS64Triples[] = {
-      "mips64-linux-gnu",      "mips-mti-linux-gnu",
-      "mips-img-linux-gnu",    "mips64-linux-gnuabi64",
+      "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64-linux-gnuabi64",
       "mipsisa64r6-linux-gnu", "mipsisa64r6-linux-gnuabi64"};
   static const char *const MIPS64ELLibDirs[] = {"/lib64", "/lib"};
   static const char *const MIPS64ELTriples[] = {
-      "mips64el-linux-gnu",      "mips-mti-linux-gnu",
-      "mips-img-linux-gnu",      "mips64el-linux-gnuabi64",
+      "mips-mti-linux-gnu", "mips-img-linux-gnu", "mips64el-linux-gnuabi64",
       "mipsisa64r6el-linux-gnu", "mipsisa64r6el-linux-gnuabi64"};
 
   static const char *const MIPSN32LibDirs[] = {"/lib32"};
@@ -2534,46 +2538,39 @@ void Generic_GCC::GCCInstallationDetector::AddDefaultGCCPrefixes(
 
   static const char *const PPCLibDirs[] = {"/lib32", "/lib"};
   static const char *const PPCTriples[] = {
-      "powerpc-linux-gnu", "powerpc-unknown-linux-gnu", "powerpc-linux-gnuspe",
+      "powerpc-unknown-linux-gnu",
       // On 32-bit PowerPC systems running SUSE Linux, gcc is configured as a
       // 64-bit compiler which defaults to "-m32", hence "powerpc64-suse-linux".
       "powerpc64-suse-linux", "powerpc-montavista-linuxspe"};
   static const char *const PPCLELibDirs[] = {"/lib32", "/lib"};
-  static const char *const PPCLETriples[] = {"powerpcle-linux-gnu",
-                                             "powerpcle-unknown-linux-gnu",
+  static const char *const PPCLETriples[] = {"powerpcle-unknown-linux-gnu",
                                              "powerpcle-linux-musl"};
 
   static const char *const PPC64LibDirs[] = {"/lib64", "/lib"};
-  static const char *const PPC64Triples[] = {
-      "powerpc64-linux-gnu", "powerpc64-unknown-linux-gnu",
-      "powerpc64-suse-linux", "ppc64-redhat-linux"};
+  static const char *const PPC64Triples[] = {"powerpc64-unknown-linux-gnu",
+                                             "powerpc64-suse-linux",
+                                             "ppc64-redhat-linux"};
   static const char *const PPC64LELibDirs[] = {"/lib64", "/lib"};
   static const char *const PPC64LETriples[] = {
-      "powerpc64le-linux-gnu", "powerpc64le-unknown-linux-gnu",
-      "powerpc64le-none-linux-gnu", "powerpc64le-suse-linux",
-      "ppc64le-redhat-linux"};
+      "powerpc64le-unknown-linux-gnu", "powerpc64le-none-linux-gnu",
+      "powerpc64le-suse-linux", "ppc64le-redhat-linux"};
 
   static const char *const RISCV32LibDirs[] = {"/lib32", "/lib"};
   static const char *const RISCV32Triples[] = {"riscv32-unknown-linux-gnu",
-                                               "riscv32-linux-gnu",
                                                "riscv32-unknown-elf"};
   static const char *const RISCV64LibDirs[] = {"/lib64", "/lib"};
   static const char *const RISCV64Triples[] = {"riscv64-unknown-linux-gnu",
-                                               "riscv64-linux-gnu",
                                                "riscv64-unknown-elf"};
 
   static const char *const SPARCv8LibDirs[] = {"/lib32", "/lib"};
-  static const char *const SPARCv8Triples[] = {"sparc-linux-gnu",
-                                               "sparcv8-linux-gnu"};
+  static const char *const SPARCv8Triples[] = {"sparcv8-linux-gnu"};
   static const char *const SPARCv9LibDirs[] = {"/lib64", "/lib"};
-  static const char *const SPARCv9Triples[] = {"sparc64-linux-gnu",
-                                               "sparcv9-linux-gnu"};
+  static const char *const SPARCv9Triples[] = {"sparcv9-linux-gnu"};
 
   static const char *const SystemZLibDirs[] = {"/lib64", "/lib"};
   static const char *const SystemZTriples[] = {
-      "s390x-linux-gnu", "s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu",
-      "s390x-suse-linux", "s390x-redhat-linux"};
-
+      "s390x-unknown-linux-gnu", "s390x-ibm-linux-gnu", "s390x-suse-linux",
+      "s390x-redhat-linux"};
 
   using std::begin;
   using std::end;
diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp
index b1ff697b368b1..f32a23f111e4b 100644
--- a/clang/lib/Driver/ToolChains/HIPUtility.cpp
+++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp
@@ -106,9 +106,9 @@ class HIPUndefinedFatBinSymbols {
         std::string ID = IA->getId().str();
         if (!ID.empty()) {
           ID = llvm::utohexstr(llvm::MD5Hash(ID), /*LowerCase=*/true);
-          FatBinSymbols.insert(Twine(FatBinPrefix + "_" + ID).str());
+          FatBinSymbols.insert((FatBinPrefix + Twine('_') + ID).str());
           GPUBinHandleSymbols.insert(
-              Twine(GPUBinHandlePrefix + "_" + ID).str());
+              (GPUBinHandlePrefix + Twine('_') + ID).str());
           continue;
         }
         if (IA->getInputArg().getNumValues() == 0)
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index c98645993abe0..c7543a48c0b50 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -2261,8 +2261,17 @@ bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
 
   unsigned PrefixLen = 0;
 
-  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
+  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
     ++PrefixLen;
+    if (!isLexingRawMode() &&
+        llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
+      const char *Pos = &CurPtr[PrefixLen];
+      Diag(Pos, LangOpts.CPlusPlus26
+                    ? diag::warn_cxx26_compat_raw_string_literal_character_set
+                    : diag::ext_cxx26_raw_string_literal_character_set)
+          << StringRef(Pos, 1);
+    }
+  }
 
   // If the last character was not a '(', then we didn't lex a valid delimiter.
   if (CurPtr[PrefixLen] != '(') {
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index e9c60f76165b6..63afc18783a1f 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -1347,11 +1347,13 @@ void Parser::ParseOpenACCCacheVarList() {
   ParseOpenACCVarList(OpenACCClauseKind::Invalid);
 }
 
-Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() {
-  SourceLocation StartLoc = getCurToken().getLocation();
+Parser::OpenACCDirectiveParseInfo
+Parser::ParseOpenACCDirective() {
+  SourceLocation StartLoc = ConsumeAnnotationToken();
+  SourceLocation DirLoc = getCurToken().getLocation();
   OpenACCDirectiveKind DirKind = ParseOpenACCDirectiveKind(*this);
 
-  getActions().OpenACC().ActOnConstruct(DirKind, StartLoc);
+  getActions().OpenACC().ActOnConstruct(DirKind, DirLoc);
 
   // Once we've parsed the construct/directive name, some have additional
   // specifiers that need to be taken care of. Atomic has an 'atomic-clause'
@@ -1390,7 +1392,7 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() {
       break;
     case OpenACCDirectiveKind::Wait:
       // OpenACC has an optional paren-wrapped 'wait-argument'.
-      if (ParseOpenACCWaitArgument(StartLoc, /*IsDirective=*/true).Failed)
+      if (ParseOpenACCWaitArgument(DirLoc, /*IsDirective=*/true).Failed)
         T.skipToEnd();
       else
         T.consumeClose();
@@ -1404,7 +1406,8 @@ Parser::OpenACCDirectiveParseInfo Parser::ParseOpenACCDirective() {
   }
 
   // Parses the list of clauses, if present, plus set up return value.
-  OpenACCDirectiveParseInfo ParseInfo{DirKind, StartLoc, SourceLocation{},
+  OpenACCDirectiveParseInfo ParseInfo{DirKind, StartLoc, DirLoc,
+                                      SourceLocation{},
                                       ParseOpenACCClauseList(DirKind)};
 
   assert(Tok.is(tok::annot_pragma_openacc_end) &&
@@ -1421,7 +1424,6 @@ Parser::DeclGroupPtrTy Parser::ParseOpenACCDirectiveDecl() {
   assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token");
 
   ParsingOpenACCDirectiveRAII DirScope(*this);
-  ConsumeAnnotationToken();
 
   OpenACCDirectiveParseInfo DirInfo = ParseOpenACCDirective();
 
@@ -1438,7 +1440,6 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() {
   assert(Tok.is(tok::annot_pragma_openacc) && "expected OpenACC Start Token");
 
   ParsingOpenACCDirectiveRAII DirScope(*this);
-  ConsumeAnnotationToken();
 
   OpenACCDirectiveParseInfo DirInfo = ParseOpenACCDirective();
   if (getActions().OpenACC().ActOnStartStmtDirective(DirInfo.DirKind,
@@ -1456,6 +1457,6 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() {
   }
 
   return getActions().OpenACC().ActOnEndStmtDirective(
-      DirInfo.DirKind, DirInfo.StartLoc, DirInfo.EndLoc, DirInfo.Clauses,
-      AssocStmt);
+      DirInfo.DirKind, DirInfo.StartLoc, DirInfo.DirLoc, DirInfo.EndLoc,
+      DirInfo.Clauses, AssocStmt);
 }
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index e959dd6378f46..cd8df3332724f 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -25,7 +25,6 @@
 #include "clang/Sema/SemaOpenMP.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/UniqueVector.h"
 #include "llvm/Frontend/OpenMP/OMPAssume.h"
 #include "llvm/Frontend/OpenMP/OMPContext.h"
 #include <optional>
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index b0af04451166c..c25203243ee49 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -239,7 +239,15 @@ StmtResult Parser::ParseStatementOrDeclarationAfterAttributes(
     auto IsStmtAttr = [](ParsedAttr &Attr) { return Attr.isStmtAttr(); };
     bool AllAttrsAreStmtAttrs = llvm::all_of(CXX11Attrs, IsStmtAttr) &&
                                 llvm::all_of(GNUAttrs, IsStmtAttr);
-    if (((GNUAttributeLoc.isValid() && !(HaveAttrs && AllAttrsAreStmtAttrs)) ||
+    // In C, the grammar production for statement (C23 6.8.1p1) does not allow
+    // declarations, unlike C++ (C++23 [stmt.pre]p1). So in C++ (or with
+    // Microsoft extensions enabled) we always allow a declaration; otherwise
+    // we must be in a statement context that allows declarations in C. For
+    // example, the following is invalid in C: if (1) int x;
+    if ((getLangOpts().CPlusPlus || getLangOpts().MicrosoftExt ||
+         (StmtCtx & ParsedStmtContext::AllowDeclarationsInC) !=
+             ParsedStmtContext()) &&
+        ((GNUAttributeLoc.isValid() && !(HaveAttrs && AllAttrsAreStmtAttrs)) ||
          isDeclarationStatement())) {
       SourceLocation DeclStart = Tok.getLocation(), DeclEnd;
       DeclGroupPtrTy Decl;
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index d3e9dcb4f4399..6595abbcdda5b 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -1444,10 +1444,10 @@ bool Sema::CheckCXXThisType(SourceLocation Loc, QualType Type) {
   //   category are defined within such member functions as they are within
   //   an implicit object member function).
   DeclContext *DC = getFunctionLevelDeclContext();
-  if (const auto *Method = dyn_cast<CXXMethodDecl>(DC);
-      Method && Method->isExplicitObjectMemberFunction()) {
+  const auto *Method = dyn_cast<CXXMethodDecl>(DC);
+  if (Method && Method->isExplicitObjectMemberFunction()) {
     Diag(Loc, diag::err_invalid_this_use) << 1;
-  } else if (isLambdaCallWithExplicitObjectParameter(CurContext)) {
+  } else if (Method && isLambdaCallWithExplicitObjectParameter(CurContext)) {
     Diag(Loc, diag::err_invalid_this_use) << 1;
   } else {
     Diag(Loc, diag::err_invalid_this_use) << 0;
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index ef0a655b631ab..be6ea20a956a3 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -5897,6 +5897,16 @@ void Sema::diagnoseTypo(const TypoCorrection &Correction,
 
   NamedDecl *ChosenDecl =
       Correction.isKeyword() ? nullptr : Correction.getFoundDecl();
+
+  // For builtin functions which aren't declared anywhere in source,
+  // don't emit the "declared here" note.
+  if (const auto *FD = dyn_cast_if_present<FunctionDecl>(ChosenDecl);
+      FD && FD->getBuiltinID() &&
+      PrevNote.getDiagID() == diag::note_previous_decl &&
+      Correction.getCorrectionRange().getBegin() == FD->getBeginLoc()) {
+    ChosenDecl = nullptr;
+  }
+
   if (PrevNote.getDiagID() && ChosenDecl)
     Diag(ChosenDecl->getLocation(), PrevNote)
       << CorrectedQuotedStr << (ErrorRecovery ? FixItHint() : FixTypo);
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 09d91b31cfe5f..15239f4f35c39 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -844,7 +844,7 @@ ExprResult SemaOpenACC::CheckReductionVar(Expr *VarExpr) {
 }
 
 void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K,
-                                 SourceLocation StartLoc) {
+                                 SourceLocation DirLoc) {
   switch (K) {
   case OpenACCDirectiveKind::Invalid:
     // Nothing to do here, an invalid kind has nothing we can check here.  We
@@ -859,7 +859,7 @@ void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K,
     // here as these constructs do not take any arguments.
     break;
   default:
-    Diag(StartLoc, diag::warn_acc_construct_unimplemented) << K;
+    Diag(DirLoc, diag::warn_acc_construct_unimplemented) << K;
     break;
   }
 }
@@ -1265,6 +1265,7 @@ bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K,
 
 StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K,
                                               SourceLocation StartLoc,
+                                              SourceLocation DirLoc,
                                               SourceLocation EndLoc,
                                               ArrayRef<OpenACCClause *> Clauses,
                                               StmtResult AssocStmt) {
@@ -1278,7 +1279,7 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K,
   case OpenACCDirectiveKind::Kernels:
     // TODO OpenACC: Add clauses to the construct here.
     return OpenACCComputeConstruct::Create(
-        getASTContext(), K, StartLoc, EndLoc, Clauses,
+        getASTContext(), K, StartLoc, DirLoc, EndLoc, Clauses,
         AssocStmt.isUsable() ? AssocStmt.get() : nullptr);
   }
   llvm_unreachable("Unhandled case in directive handling?");
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 8735d96c84079..6f538ed55cb72 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -285,7 +285,7 @@ bool Sema::CheckAlwaysInlineAttr(const Stmt *OrigSt, const Stmt *CurSt,
 static Attr *handleNoInlineAttr(Sema &S, Stmt *St, const ParsedAttr &A,
                                 SourceRange Range) {
   NoInlineAttr NIA(S.Context, A);
-  if (!NIA.isClangNoInline()) {
+  if (!NIA.isStmtNoInline()) {
     S.Diag(St->getBeginLoc(), diag::warn_function_attribute_ignored_in_stmt)
         << "[[clang::noinline]]";
     return nullptr;
@@ -684,10 +684,8 @@ ExprResult Sema::ActOnCXXAssumeAttr(Stmt *St, const ParsedAttr &A,
   }
 
   if (!getLangOpts().CPlusPlus23 &&
-      A.getSyntax() == AttributeCommonInfo::AS_CXX11) {
-    llvm::dbgs() << "Syntax: " << int(A.getSyntax()) << "\n";
+      A.getSyntax() == AttributeCommonInfo::AS_CXX11)
     Diag(A.getLoc(), diag::ext_cxx23_attr) << A << Range;
-  }
 
   return Assumption;
 }
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index dee335b526991..765e6177d202d 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -4033,11 +4033,12 @@ class TreeTransform {
 
   StmtResult RebuildOpenACCComputeConstruct(OpenACCDirectiveKind K,
                                             SourceLocation BeginLoc,
+                                            SourceLocation DirLoc,
                                             SourceLocation EndLoc,
                                             ArrayRef<OpenACCClause *> Clauses,
                                             StmtResult StrBlock) {
-    return getSema().OpenACC().ActOnEndStmtDirective(K, BeginLoc, EndLoc,
-                                                     Clauses, StrBlock);
+    return getSema().OpenACC().ActOnEndStmtDirective(K, BeginLoc, DirLoc,
+                                                     EndLoc, Clauses, StrBlock);
   }
 
 private:
@@ -11559,8 +11560,8 @@ StmtResult TreeTransform<Derived>::TransformOpenACCComputeConstruct(
       getSema().OpenACC().ActOnAssociatedStmt(C->getDirectiveKind(), StrBlock);
 
   return getDerived().RebuildOpenACCComputeConstruct(
-      C->getDirectiveKind(), C->getBeginLoc(), C->getEndLoc(),
-      TransformedClauses, StrBlock);
+      C->getDirectiveKind(), C->getBeginLoc(), C->getDirectiveLoc(),
+      C->getEndLoc(), TransformedClauses, StrBlock);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index eac4faff28549..bea2b94989107 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2797,6 +2797,7 @@ void ASTStmtReader::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) {
   (void)Record.readInt();
   S->Kind = Record.readEnum<OpenACCDirectiveKind>();
   S->Range = Record.readSourceRange();
+  S->DirectiveLoc = Record.readSourceLocation();
   Record.readOpenACCClauseList(S->Clauses);
 }
 
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index dd548fabfd955..e830c4026ea78 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7835,7 +7835,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   case OpenACCClauseKind::If: {
     const auto *IC = cast<OpenACCIfClause>(C);
     writeSourceLocation(IC->getLParenLoc());
-    writeStmtRef(IC->getConditionExpr());
+    AddStmt(const_cast<Expr*>(IC->getConditionExpr()));
     return;
   }
   case OpenACCClauseKind::Self: {
@@ -7843,7 +7843,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
     writeSourceLocation(SC->getLParenLoc());
     writeBool(SC->hasConditionExpr());
     if (SC->hasConditionExpr())
-      writeStmtRef(SC->getConditionExpr());
+      AddStmt(const_cast<Expr*>(SC->getConditionExpr()));
     return;
   }
   case OpenACCClauseKind::NumGangs: {
@@ -7857,13 +7857,13 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
   case OpenACCClauseKind::NumWorkers: {
     const auto *NWC = cast<OpenACCNumWorkersClause>(C);
     writeSourceLocation(NWC->getLParenLoc());
-    writeStmtRef(NWC->getIntExpr());
+    AddStmt(const_cast<Expr*>(NWC->getIntExpr()));
     return;
   }
   case OpenACCClauseKind::VectorLength: {
     const auto *NWC = cast<OpenACCVectorLengthClause>(C);
     writeSourceLocation(NWC->getLParenLoc());
-    writeStmtRef(NWC->getIntExpr());
+    AddStmt(const_cast<Expr*>(NWC->getIntExpr()));
     return;
   }
   case OpenACCClauseKind::Private: {
@@ -7942,15 +7942,15 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
     writeSourceLocation(AC->getLParenLoc());
     writeBool(AC->hasIntExpr());
     if (AC->hasIntExpr())
-      writeStmtRef(AC->getIntExpr());
+      AddStmt(const_cast<Expr*>(AC->getIntExpr()));
     return;
   }
   case OpenACCClauseKind::Wait: {
     const auto *WC = cast<OpenACCWaitClause>(C);
     writeSourceLocation(WC->getLParenLoc());
     writeBool(WC->getDevNumExpr());
-    if (const Expr *DNE = WC->getDevNumExpr())
-      writeStmtRef(DNE);
+    if (Expr *DNE = WC->getDevNumExpr())
+      AddStmt(DNE);
     writeSourceLocation(WC->getQueuesLoc());
 
     writeOpenACCIntExprList(WC->getQueueIdExprs());
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index a44852af97bea..3c586b270fbf4 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2847,6 +2847,7 @@ void ASTStmtWriter::VisitOpenACCConstructStmt(OpenACCConstructStmt *S) {
   Record.push_back(S->clauses().size());
   Record.writeEnum(S->Kind);
   Record.AddSourceRange(S->Range);
+  Record.AddSourceLocation(S->DirectiveLoc);
   Record.writeOpenACCClauseList(S->clauses());
 }
 
diff --git a/clang/test/C/C99/block-scopes.c b/clang/test/C/C99/block-scopes.c
index 589047df3e52b..116e5d922593e 100644
--- a/clang/test/C/C99/block-scopes.c
+++ b/clang/test/C/C99/block-scopes.c
@@ -18,8 +18,9 @@
 
 enum {a, b};
 void different(void) {
-  if (sizeof(enum {b, a}) != sizeof(int))
+  if (sizeof(enum {b, a}) != sizeof(int)) {
     _Static_assert(a == 1, "");
+  }
   /* In C89, the 'b' found here would have been from the enum declaration in
    * the controlling expression of the selection statement, not from the global
    * declaration. In C99 and later, that enumeration is scoped to the 'if'
diff --git a/clang/test/CodeGen/X86/x86-atomic-double.c b/clang/test/CodeGen/X86/x86-atomic-double.c
new file mode 100644
index 0000000000000..2354c89cc2b17
--- /dev/null
+++ b/clang/test/CodeGen/X86/x86-atomic-double.c
@@ -0,0 +1,104 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=X64 %s
+// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=X86 %s
+
+
+// X64-LABEL: define dso_local double @test_double_post_inc(
+// X64-SAME: ) #[[ATTR0:[0-9]+]] {
+// X64-NEXT:  entry:
+// X64-NEXT:    [[RETVAL:%.*]] = alloca double, align 8
+// X64-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.000000e+00 seq_cst, align 8
+// X64-NEXT:    store float [[TMP0]], ptr [[RETVAL]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8
+// X64-NEXT:    ret double [[TMP1]]
+//
+// X86-LABEL: define dso_local double @test_double_post_inc(
+// X86-SAME: ) #[[ATTR0:[0-9]+]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca double, align 4
+// X86-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_post_inc.n, float 1.000000e+00 seq_cst, align 8
+// X86-NEXT:    store float [[TMP0]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret double [[TMP1]]
+//
+double test_double_post_inc() // Post-increments an atomic double and returns the expression's value.
+{ // NOTE(review): the generated checks above use a `float` operand for this double -- confirm intended.
+    static _Atomic double n; // Function-local static; the checks refer to it as @test_double_post_inc.n.
+    return n++; // The checks expect a single `atomicrmw fadd` whose result is returned.
+}
+
+// X64-LABEL: define dso_local double @test_double_post_dc(
+// X64-SAME: ) #[[ATTR0]] {
+// X64-NEXT:  entry:
+// X64-NEXT:    [[RETVAL:%.*]] = alloca double, align 8
+// X64-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.000000e+00 seq_cst, align 8
+// X64-NEXT:    store float [[TMP0]], ptr [[RETVAL]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 8
+// X64-NEXT:    ret double [[TMP1]]
+//
+// X86-LABEL: define dso_local double @test_double_post_dc(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca double, align 4
+// X86-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_post_dc.n, float 1.000000e+00 seq_cst, align 8
+// X86-NEXT:    store float [[TMP0]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret double [[TMP1]]
+//
+double test_double_post_dc() // Post-decrements an atomic double and returns the expression's value.
+{ // NOTE(review): the generated checks above use a `float` operand for this double -- confirm intended.
+    static _Atomic double n; // Function-local static; the checks refer to it as @test_double_post_dc.n.
+    return n--; // The checks expect a single `atomicrmw fsub` whose result is returned.
+}
+
+// X64-LABEL: define dso_local double @test_double_pre_dc(
+// X64-SAME: ) #[[ATTR0]] {
+// X64-NEXT:  entry:
+// X64-NEXT:    [[RETVAL:%.*]] = alloca double, align 8
+// X64-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, float 1.000000e+00 seq_cst, align 8
+// X64-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// X64-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 8
+// X64-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 8
+// X64-NEXT:    ret double [[TMP2]]
+//
+// X86-LABEL: define dso_local double @test_double_pre_dc(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca double, align 4
+// X86-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_double_pre_dc.n, float 1.000000e+00 seq_cst, align 8
+// X86-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
+// X86-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret double [[TMP2]]
+//
+double test_double_pre_dc() // Pre-decrements an atomic double and returns the expression's value.
+{ // NOTE(review): the generated checks above use a `float` operand for this double -- confirm intended.
+    static _Atomic double n; // Function-local static; the checks refer to it as @test_double_pre_dc.n.
+    return --n; // The checks expect `atomicrmw fsub` followed by an `fsub` of 1.0 on its result.
+}
+
+// X64-LABEL: define dso_local double @test_double_pre_inc(
+// X64-SAME: ) #[[ATTR0]] {
+// X64-NEXT:  entry:
+// X64-NEXT:    [[RETVAL:%.*]] = alloca double, align 8
+// X64-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, float 1.000000e+00 seq_cst, align 8
+// X64-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// X64-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 8
+// X64-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 8
+// X64-NEXT:    ret double [[TMP2]]
+//
+// X86-LABEL: define dso_local double @test_double_pre_inc(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca double, align 4
+// X86-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_double_pre_inc.n, float 1.000000e+00 seq_cst, align 8
+// X86-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
+// X86-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret double [[TMP2]]
+//
+double test_double_pre_inc() // Pre-increments an atomic double and returns the expression's value.
+{ // NOTE(review): the generated checks above use a `float` operand for this double -- confirm intended.
+    static _Atomic double n; // Function-local static; the checks refer to it as @test_double_pre_inc.n.
+    return ++n; // The checks expect `atomicrmw fadd` followed by an `fadd` of 1.0 on its result.
+}
diff --git a/clang/test/CodeGen/X86/x86-atomic-float.c b/clang/test/CodeGen/X86/x86-atomic-float.c
index 2d3c72d2a0299..6ee441c2dd7a8 100644
--- a/clang/test/CodeGen/X86/x86-atomic-float.c
+++ b/clang/test/CodeGen/X86/x86-atomic-float.c
@@ -1,11 +1,11 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK,CHECK64 %s
-// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefixes=CHECK,CHECK32 %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s
 
 
 // CHECK-LABEL: define dso_local i32 @test_int_inc(
 // CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  entry:
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw add ptr @test_int_inc.n, i32 1 seq_cst, align 4
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
@@ -17,7 +17,7 @@ int test_int_inc()
 
 // CHECK-LABEL: define dso_local float @test_float_post_inc(
 // CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_float_post_inc.n, float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    ret float [[TMP0]]
 //
@@ -29,7 +29,7 @@ float test_float_post_inc()
 
 // CHECK-LABEL: define dso_local float @test_float_post_dc(
 // CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_float_post_dc.n, float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    ret float [[TMP0]]
 //
@@ -41,7 +41,7 @@ float test_float_post_dc()
 
 // CHECK-LABEL: define dso_local float @test_float_pre_dc(
 // CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fsub ptr @test_float_pre_dc.n, float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = fsub float [[TMP0]], 1.000000e+00
 // CHECK-NEXT:    ret float [[TMP1]]
@@ -54,7 +54,7 @@ float test_float_pre_dc()
 
 // CHECK-LABEL: define dso_local float @test_float_pre_inc(
 // CHECK-SAME: ) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
+// CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TMP0:%.*]] = atomicrmw fadd ptr @test_float_pre_inc.n, float 1.000000e+00 seq_cst, align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = fadd float [[TMP0]], 1.000000e+00
 // CHECK-NEXT:    ret float [[TMP1]]
@@ -64,6 +64,3 @@ float test_float_pre_inc()
     static _Atomic float n;
     return ++n;
 }
-//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-// CHECK32: {{.*}}
-// CHECK64: {{.*}}
diff --git a/clang/test/CodeGen/X86/x86-atomic-long_double.c b/clang/test/CodeGen/X86/x86-atomic-long_double.c
index 74a22d5db151e..2c3f381f13511 100644
--- a/clang/test/CodeGen/X86/x86-atomic-long_double.c
+++ b/clang/test/CodeGen/X86/x86-atomic-long_double.c
@@ -1,170 +1,171 @@
-// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck -check-prefix=CHECK32 %s
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck --check-prefixes=X64 %s
+// RUN: %clang_cc1 -triple i686-linux-gnu -target-cpu core2 %s -emit-llvm -o - | FileCheck --check-prefixes=X86 %s
 
-// CHECK-LABEL: define dso_local x86_fp80 @testinc(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
-// CHECK-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+// X64-LABEL: define dso_local x86_fp80 @testinc(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// X64-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// X64-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP3]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @testinc(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
-// CHECK32-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
-// CHECK32-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP3]]
+// X86-LABEL: define dso_local x86_fp80 @testinc(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// X86-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// X86-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP3]]
 //
 long double testinc(_Atomic long double *addr) {
 
   return ++*addr;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @testdec(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
-// CHECK-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP2]]
+// X64-LABEL: define dso_local x86_fp80 @testdec(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// X64-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 16
+// X64-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP2]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @testdec(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
-// CHECK32-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+// X86-LABEL: define dso_local x86_fp80 @testdec(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// X86-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP2]]
 //
 long double testdec(_Atomic long double *addr) {
 
   return (*addr)--;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @testcompassign(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP0]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    br label [[ATOMIC_OP:%.*]]
-// CHECK:       atomic_op:
-// CHECK-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[ATOMIC_OP]] ]
-// CHECK-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
-// CHECK-NEXT:    store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16
-// CHECK-NEXT:    [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16
-// CHECK-NEXT:    br i1 [[TMP7]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
-// CHECK:       atomic_cont:
-// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD4:%.*]] = load atomic i128, ptr [[TMP9]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP10]]
+// X64-LABEL: define dso_local x86_fp80 @testcompassign(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*]]:
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP0]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    br label %[[ATOMIC_OP:.*]]
+// X64:       [[ATOMIC_OP]]:
+// X64-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ]
+// X64-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16
+// X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16
+// X64-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16
+// X64-NEXT:    [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0
+// X64-NEXT:    [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
+// X64-NEXT:    store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16
+// X64-NEXT:    [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16
+// X64-NEXT:    br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]]
+// X64:       [[ATOMIC_CONT]]:
+// X64-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD4:%.*]] = load atomic i128, ptr [[TMP9]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16
+// X64-NEXT:    [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP10]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @testcompassign(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4
-// CHECK32-NEXT:    br label [[ATOMIC_OP:%.*]]
-// CHECK32:       atomic_op:
-// CHECK32-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[ATOMIC_OP]] ]
-// CHECK32-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4
-// CHECK32-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5)
-// CHECK32-NEXT:    [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    br i1 [[CALL]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
-// CHECK32:       atomic_cont:
-// CHECK32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP5]]
+// X86-LABEL: define dso_local x86_fp80 @testcompassign(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*]]:
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// X86-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4
+// X86-NEXT:    br label %[[ATOMIC_OP:.*]]
+// X86:       [[ATOMIC_OP]]:
+// X86-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ]
+// X86-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4
+// X86-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5)
+// X86-NEXT:    [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]]
+// X86:       [[ATOMIC_CONT]]:
+// X86-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5)
+// X86-NEXT:    [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP5]]
 //
 long double testcompassign(_Atomic long double *addr) {
   *addr -= 25;
   return *addr;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @testassign(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    store atomic i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP2]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+// X64-LABEL: define dso_local x86_fp80 @testassign(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    store atomic i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16
+// X64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic i128, ptr [[TMP2]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP3]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @testassign(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4
-// CHECK32-NEXT:    call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+// X86-LABEL: define dso_local x86_fp80 @testassign(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4
+// X86-NEXT:    call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5)
+// X86-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP2]]
 //
 long double testassign(_Atomic long double *addr) {
   *addr = 115;
@@ -172,168 +173,168 @@ long double testassign(_Atomic long double *addr) {
   return *addr;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_inc(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
-// CHECK-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+// X64-LABEL: define dso_local x86_fp80 @test_volatile_inc(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// X64-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// X64-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP3]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_inc(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
-// CHECK32-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
-// CHECK32-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP3]]
+// X86-LABEL: define dso_local x86_fp80 @test_volatile_inc(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = atomicrmw fadd ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// X86-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 1.000000e+00
+// X86-NEXT:    store float [[TMP2]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP3]]
 //
 long double test_volatile_inc(volatile _Atomic long double *addr) {
   return ++*addr;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_dec(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
-// CHECK-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP2]]
+// X64-LABEL: define dso_local x86_fp80 @test_volatile_dec(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 16
+// X64-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 16
+// X64-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP2]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_dec(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
-// CHECK32-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+// X86-LABEL: define dso_local x86_fp80 @test_volatile_dec(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[RETVAL:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = atomicrmw fsub ptr [[TMP0]], float 1.000000e+00 seq_cst, align 4
+// X86-NEXT:    store float [[TMP1]], ptr [[RETVAL]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[RETVAL]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP2]]
 //
 long double test_volatile_dec(volatile _Atomic long double *addr) {
   return (*addr)--;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_compassign(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP0]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    br label [[ATOMIC_OP:%.*]]
-// CHECK:       atomic_op:
-// CHECK-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[ATOMIC_OP]] ]
-// CHECK-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
-// CHECK-NEXT:    store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16
-// CHECK-NEXT:    [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16
-// CHECK-NEXT:    br i1 [[TMP7]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
-// CHECK:       atomic_cont:
-// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD4:%.*]] = load atomic volatile i128, ptr [[TMP9]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP10]]
+// X64-LABEL: define dso_local x86_fp80 @test_volatile_compassign(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*]]:
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP5:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP0]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    br label %[[ATOMIC_OP:.*]]
+// X64:       [[ATOMIC_OP]]:
+// X64-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[ATOMIC_OP]] ]
+// X64-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP1]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load i128, ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP2]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 16
+// X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[ATOMIC_TEMP2]], align 16
+// X64-NEXT:    [[TMP5:%.*]] = cmpxchg volatile ptr [[TMP0]], i128 [[TMP3]], i128 [[TMP4]] seq_cst seq_cst, align 16
+// X64-NEXT:    [[TMP6:%.*]] = extractvalue { i128, i1 } [[TMP5]], 0
+// X64-NEXT:    [[TMP7:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
+// X64-NEXT:    store i128 [[TMP6]], ptr [[ATOMIC_TEMP3]], align 16
+// X64-NEXT:    [[TMP8]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 16
+// X64-NEXT:    br i1 [[TMP7]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]]
+// X64:       [[ATOMIC_CONT]]:
+// X64-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD4:%.*]] = load atomic volatile i128, ptr [[TMP9]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD4]], ptr [[ATOMIC_TEMP5]], align 16
+// X64-NEXT:    [[TMP10:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP5]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP10]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_compassign(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4
-// CHECK32-NEXT:    br label [[ATOMIC_OP:%.*]]
-// CHECK32:       atomic_op:
-// CHECK32-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[ATOMIC_OP]] ]
-// CHECK32-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4
-// CHECK32-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5)
-// CHECK32-NEXT:    [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    br i1 [[CALL]], label [[ATOMIC_CONT:%.*]], label [[ATOMIC_OP]]
-// CHECK32:       atomic_cont:
-// CHECK32-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP5]]
+// X86-LABEL: define dso_local x86_fp80 @test_volatile_compassign(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*]]:
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP2:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP3:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// X86-NEXT:    [[TMP1:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP]], align 4
+// X86-NEXT:    br label %[[ATOMIC_OP:.*]]
+// X86:       [[ATOMIC_OP]]:
+// X86-NEXT:    [[TMP2:%.*]] = phi x86_fp80 [ [[TMP1]], %[[ENTRY]] ], [ [[TMP3:%.*]], %[[ATOMIC_OP]] ]
+// X86-NEXT:    [[SUB:%.*]] = fsub x86_fp80 [[TMP2]], 0xK4003C800000000000000
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 [[TMP2]], ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP2]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 [[SUB]], ptr [[ATOMIC_TEMP2]], align 4
+// X86-NEXT:    [[CALL:%.*]] = call zeroext i1 @__atomic_compare_exchange(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP1]], ptr noundef [[ATOMIC_TEMP2]], i32 noundef 5, i32 noundef 5)
+// X86-NEXT:    [[TMP3]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    br i1 [[CALL]], label %[[ATOMIC_CONT:.*]], label %[[ATOMIC_OP]]
+// X86:       [[ATOMIC_CONT]]:
+// X86-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP4]], ptr noundef [[ATOMIC_TEMP3]], i32 noundef 5)
+// X86-NEXT:    [[TMP5:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP3]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP5]]
 //
 long double test_volatile_compassign(volatile _Atomic long double *addr) {
   *addr -= 25;
   return *addr;
 }
 
-// CHECK-LABEL: define dso_local x86_fp80 @test_volatile_assign(
-// CHECK-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
-// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false)
-// CHECK-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16
-// CHECK-NEXT:    store atomic volatile i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16
-// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
-// CHECK-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP2]] seq_cst, align 16
-// CHECK-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16
-// CHECK-NEXT:    ret x86_fp80 [[TMP3]]
+// X64-LABEL: define dso_local x86_fp80 @test_volatile_assign(
+// X64-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X64-NEXT:  [[ENTRY:.*:]]
+// X64-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// X64-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 16
+// X64-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    call void @llvm.memset.p0.i64(ptr align 16 [[ATOMIC_TEMP]], i8 0, i64 16, i1 false)
+// X64-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[ATOMIC_TEMP]], align 16
+// X64-NEXT:    store atomic volatile i128 [[TMP1]], ptr [[TMP0]] seq_cst, align 16
+// X64-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8
+// X64-NEXT:    [[ATOMIC_LOAD:%.*]] = load atomic volatile i128, ptr [[TMP2]] seq_cst, align 16
+// X64-NEXT:    store i128 [[ATOMIC_LOAD]], ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    [[TMP3:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 16
+// X64-NEXT:    ret x86_fp80 [[TMP3]]
 //
-// CHECK32-LABEL: define dso_local x86_fp80 @test_volatile_assign(
-// CHECK32-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
-// CHECK32-NEXT:  entry:
-// CHECK32-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
-// CHECK32-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false)
-// CHECK32-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4
-// CHECK32-NEXT:    call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
-// CHECK32-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5)
-// CHECK32-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
-// CHECK32-NEXT:    ret x86_fp80 [[TMP2]]
+// X86-LABEL: define dso_local x86_fp80 @test_volatile_assign(
+// X86-SAME: ptr noundef [[ADDR:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  [[ENTRY:.*:]]
+// X86-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[ATOMIC_TEMP:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    [[ATOMIC_TEMP1:%.*]] = alloca x86_fp80, align 4
+// X86-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 12, i1 false)
+// X86-NEXT:    store x86_fp80 0xK4005E600000000000000, ptr [[ATOMIC_TEMP]], align 4
+// X86-NEXT:    call void @__atomic_store(i32 noundef 12, ptr noundef [[TMP0]], ptr noundef [[ATOMIC_TEMP]], i32 noundef 5)
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 4
+// X86-NEXT:    call void @__atomic_load(i32 noundef 12, ptr noundef [[TMP1]], ptr noundef [[ATOMIC_TEMP1]], i32 noundef 5)
+// X86-NEXT:    [[TMP2:%.*]] = load x86_fp80, ptr [[ATOMIC_TEMP1]], align 4
+// X86-NEXT:    ret x86_fp80 [[TMP2]]
 //
 long double test_volatile_assign(volatile _Atomic long double *addr) {
   *addr = 115;
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
index 57ea4d2a1ac47..21a8229bbf244 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
@@ -1,14 +1,14 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
 // RUN:  -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
 // RUN:  -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
 // RUN:  -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
 // RUN:  -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
 // RUN:  -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
 #include <arm_sme.h>
@@ -745,3 +745,67 @@ svfloat32x4_t test_svclamp_single_f32_x4(svfloat32x4_t op1, svfloat32_t op2, svf
 svfloat64x4_t test_svclamp_single_f64_x4(svfloat64x4_t op1, svfloat64_t op2, svfloat64_t op3) __arm_streaming {
   return SVE_ACLE_FUNC(svclamp, _single_f64_x4, , )(op1, op2, op3);
 }
+
+// CHECK-LABEL: @test_svclamp_single_bf16_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
+// CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP6]]
+//
+// CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
+// CPP-CHECK-NEXT:    ret <vscale x 16 x bfloat> [[TMP6]]
+//
+svbfloat16x2_t test_svclamp_single_bf16_x2(svbfloat16x2_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming {
+  return SVE_ACLE_FUNC(svclamp, _single_bf16_x2, , )(op1, op2, op3);
+}
+
+// CHECK-LABEL: @test_svclamp_single_bf16_x4(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
+// CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
+// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 3
+// CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], <vscale x 8 x bfloat> [[TMP11]], i64 24)
+// CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP12]]
+//
+// CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_tS0_(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 24)
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
+// CPP-CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
+// CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
+// CPP-CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
+// CPP-CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
+// CPP-CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
+// CPP-CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 3
+// CPP-CHECK-NEXT:    [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], <vscale x 8 x bfloat> [[TMP11]], i64 24)
+// CPP-CHECK-NEXT:    ret <vscale x 32 x bfloat> [[TMP12]]
+//
+svbfloat16x4_t test_svclamp_single_bf16_x4(svbfloat16x4_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming {
+  return SVE_ACLE_FUNC(svclamp, _single_bf16_x4, , )(op1, op2, op3);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
index 4a5ee7e021f74..e26499d3a63cc 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
@@ -497,3 +497,25 @@ svuint8_t test_qcvt_u8_s32_x4(svint32x4_t zn) __arm_streaming {
 svuint16_t test_qcvt_u16_s64_x4(svint64x4_t zn) __arm_streaming {
   return SVE_ACLE_FUNC(svqcvt_u16,_s64_x4,,)(zn);
 }
+
+// CHECK-LABEL: @test_cvt_f32_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z15test_cvt_f32_x2u13__SVFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+__attribute__((target("sme-f16f16"))) svfloat32x2_t test_cvt_f32_x2(svfloat16_t zn)  __arm_streaming {
+  return SVE_ACLE_FUNC(svcvt_f32,_f16_x2,,)(zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c
new file mode 100644
index 0000000000000..453dd3db6adf0
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtl.c
@@ -0,0 +1,40 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme-f16f16 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: @test_cvtl_f32_x2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+// CPP-CHECK-LABEL: @_Z16test_cvtl_f32_x2u13__SVFloat16_t(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> [[ZN:%.*]])
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP1]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP0]], 1
+// CPP-CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4)
+// CPP-CHECK-NEXT:    ret <vscale x 8 x float> [[TMP4]]
+//
+svfloat32x2_t test_cvtl_f32_x2(svfloat16_t zn)  __arm_streaming {
+  return SVE_ACLE_FUNC(svcvtl_f32,_f16_x2,,)(zn);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c
new file mode 100644
index 0000000000000..2ad2044c267ed
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_zero.c
@@ -0,0 +1,139 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#define SVE_ACLE_FUNC(A1,A2) A1##A2
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg1x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg1x2(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+   SVE_ACLE_FUNC(svzero_za64,_vg1x2)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg1x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg1x4(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg1x4)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg2x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg2x1j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg2x1(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg2x1)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg2x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg2x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg2x2(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg2x2)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg2x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg2x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg2x4(uint32_t slice)  __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg2x4)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg4x1(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg4x1j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg4x1(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg4x1)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg4x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg4x2j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg4x2(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg4x2)(slice);
+}
+
+// CHECK-LABEL: define dso_local void @test_svzero_za64_vg4x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 [[SLICE]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z22test_svzero_za64_vg4x4j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 [[SLICE]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svzero_za64_vg4x4(uint32_t slice) __arm_streaming __arm_inout("za"){
+   SVE_ACLE_FUNC(svzero_za64,_vg4x4)(slice);
+}
diff --git a/clang/test/CodeGen/attr-noinline.cpp b/clang/test/CodeGen/attr-noinline.cpp
index f0588cfecf463..c1fb9941b5251 100644
--- a/clang/test/CodeGen/attr-noinline.cpp
+++ b/clang/test/CodeGen/attr-noinline.cpp
@@ -9,6 +9,7 @@ static int baz(int x) {
 }
 
 [[clang::noinline]] bool noi() { }
+[[msvc::noinline]] bool ms_noi() { return true; }
 
 void foo(int i) {
   [[clang::noinline]] bar();
@@ -39,6 +40,31 @@ void foo(int i) {
 // CHECK: call noundef zeroext i1 @_Z3barv()
 }
 
+void ms_noi_check(int i) {
+  [[msvc::noinline]] bar();
+// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR:[0-9]+]]
+  [[msvc::noinline]] i = baz(i);
+// CHECK: call noundef i32 @_ZL3bazi({{.*}}) #[[NOINLINEATTR]]
+  [[msvc::noinline]] (i = 4, bar());
+// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]]
+  [[msvc::noinline]] (void)(bar());
+// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]]
+  [[msvc::noinline]] f(bar(), bar());
+// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]]
+// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]]
+// CHECK: call void @_Z1fbb({{.*}}) #[[NOINLINEATTR]]
+  [[msvc::noinline]] [] { bar(); bar(); }(); // noinline only applies to the anonymous function call
+// CHECK: call void @"_ZZ12ms_noi_checkiENK3$_0clEv"(ptr {{[^,]*}} %ref.tmp) #[[NOINLINEATTR]]
+  [[msvc::noinline]] for (bar(); bar(); bar()) {}
+// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]]
+// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]]
+// CHECK: call noundef zeroext i1 @_Z3barv() #[[NOINLINEATTR]]
+  [[msvc::noinline]] ms_noi();
+// CHECK: call noundef zeroext i1 @_Z6ms_noiv()
+  ms_noi();
+// CHECK: call noundef zeroext i1 @_Z6ms_noiv()
+}
+
 struct S {
   friend bool operator==(const S &LHS, const S &RHS);
 };
@@ -50,6 +76,12 @@ void func(const S &s1, const S &s2) {
   bool b;
   [[clang::noinline]] b = s1 == s2;
 // CHECK: call noundef zeroext i1 @_ZeqRK1SS1_({{.*}}) #[[NOINLINEATTR]]
+
+  [[msvc::noinline]]g(s1 == s2);
+// CHECK: call noundef zeroext i1 @_ZeqRK1SS1_({{.*}}) #[[NOINLINEATTR]]
+// CHECK: call void @_Z1gb({{.*}}) #[[NOINLINEATTR]]
+  [[msvc::noinline]] b = s1 == s2;
+// CHECK: call noundef zeroext i1 @_ZeqRK1SS1_({{.*}}) #[[NOINLINEATTR]]
 }
 
 // CHECK: attributes #[[NOINLINEATTR]] = { noinline }
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index 93a6ab06081c9..d6ee4f68700dc 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -825,6 +825,30 @@ float extract_lane_f16x8(f16x8 a, int i) {
   // WEBASSEMBLY-NEXT: ret float %0
   return __builtin_wasm_extract_lane_f16x8(a, i);
 }
+
+f16x8 min_f16x8(f16x8 a, f16x8 b) {
+  // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.minimum.v8f16(<8 x half> %a, <8 x half> %b)
+  // WEBASSEMBLY-NEXT: ret <8 x half> %0
+  return __builtin_wasm_min_f16x8(a, b);
+}
+
+f16x8 max_f16x8(f16x8 a, f16x8 b) {
+  // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.maximum.v8f16(<8 x half> %a, <8 x half> %b)
+  // WEBASSEMBLY-NEXT: ret <8 x half> %0
+  return __builtin_wasm_max_f16x8(a, b);
+}
+
+f16x8 pmin_f16x8(f16x8 a, f16x8 b) {
+  // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.wasm.pmin.v8f16(<8 x half> %a, <8 x half> %b)
+  // WEBASSEMBLY-NEXT: ret <8 x half> %0
+  return __builtin_wasm_pmin_f16x8(a, b);
+}
+
+f16x8 pmax_f16x8(f16x8 a, f16x8 b) {
+  // WEBASSEMBLY:  %0 = tail call <8 x half> @llvm.wasm.pmax.v8f16(<8 x half> %a, <8 x half> %b)
+  // WEBASSEMBLY-NEXT: ret <8 x half> %0
+  return __builtin_wasm_pmax_f16x8(a, b);
+}
 __externref_t externref_null() {
   return __builtin_wasm_ref_null_extern();
   // WEBASSEMBLY: tail call ptr addrspace(10) @llvm.wasm.ref.null.extern()
diff --git a/clang/test/CodeGenCXX/no-unique-address.cpp b/clang/test/CodeGenCXX/no-unique-address.cpp
index 7b4bbbf2a05d5..82532c5e1be82 100644
--- a/clang/test/CodeGenCXX/no-unique-address.cpp
+++ b/clang/test/CodeGenCXX/no-unique-address.cpp
@@ -101,3 +101,28 @@ struct HasZeroSizedFieldWithNonTrivialInit {
 HasZeroSizedFieldWithNonTrivialInit testHasZeroSizedFieldWithNonTrivialInit = {.a = 1};
 // CHECK-LABEL: define {{.*}}cxx_global_var_init
 // CHECK: call {{.*}}@_ZN14NonTrivialInitC1Ev({{.*}}@testHasZeroSizedFieldWithNonTrivialInit
+
+void *operator new(unsigned long, void *);
+template <class Ty>
+struct _box {
+  [[no_unique_address]] Ty _value;
+};
+// Make sure this doesn't crash.
+// CHECK-LABEL: define {{.*}}placement_new_struct
+void placement_new_struct() {
+  struct set_value_t {};
+
+  // GH88077
+  struct _tuple : _box<set_value_t>, _box<int> {};
+
+  int _storage[1];
+  new (_storage) _tuple{};
+
+  // GH89547
+  struct _tuple2 {
+    _box<set_value_t> a;
+  };
+
+  int _storage2[1];
+  new (_storage2) _tuple2{};
+}
diff --git a/clang/test/CodeGenCXX/partitions.cpp b/clang/test/CodeGenCXX/partitions.cpp
index d283dd071f6b2..e80e68f82974b 100644
--- a/clang/test/CodeGenCXX/partitions.cpp
+++ b/clang/test/CodeGenCXX/partitions.cpp
@@ -40,12 +40,12 @@ export int use() {
 }
 
 // FIXME: The definition of the variables shouldn't be exported too.
-// CHECK: @_ZW3mod1a = available_externally global
-// CHECK: @_ZW3mod1b = available_externally global
+// CHECK: @_ZW3mod1a = external global
+// CHECK: @_ZW3mod1b = external global
 // CHECK: declare{{.*}} i32 @_ZW3mod3foov
 // CHECK: declare{{.*}} i32 @_ZW3mod3barv
 
-// CHECK-OPT: @_ZW3mod1a = available_externally global
-// CHECK-OPT: @_ZW3mod1b = available_externally global
+// CHECK-OPT: @_ZW3mod1a = external global
+// CHECK-OPT: @_ZW3mod1b = external global
 // CHECK-OPT: declare{{.*}} i32 @_ZW3mod3foov
 // CHECK-OPT: declare{{.*}} i32 @_ZW3mod3barv
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 1d5f001c23fcc..3022ed1250d59 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -423,8 +423,8 @@
 // RUN: %clang -target x86_64-unknown-linux-gnu -mno-apxf -mapxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=APXF %s
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapxf -mno-apxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-APXF %s
 //
-// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd"
-// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd"
+// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf"
+// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf"
 
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=egpr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR %s
 // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=push2pop2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PUSH2POP2 %s
diff --git a/clang/test/ExtractAPI/anonymous_record_no_typedef.c b/clang/test/ExtractAPI/anonymous_record_no_typedef.c
index 71e460afb1283..789316ca8930b 100644
--- a/clang/test/ExtractAPI/anonymous_record_no_typedef.c
+++ b/clang/test/ExtractAPI/anonymous_record_no_typedef.c
@@ -84,21 +84,15 @@ struct Vehicle {
     // TYPE: "text": "The type of vehicle."
     // TYPE: "title": "type"
 
-    // BICYCLE: "!testRelLabel": "memberOf $ c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Bicycle $ c:@S@Vehicle@FI@type"
     // BICYCLE-LABEL: "!testLabel": "c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Bicycle"
     // BICYCLE: "title": "Bicycle"
     // BICYCLE:      "pathComponents": [
-    // BICYCLE-NEXT:   "Vehicle",
-    // BICYCLE-NEXT:   "type",
     // BICYCLE-NEXT:   "Bicycle"
     // BICYCLE-NEXT: ]
 
-    // CAR: "!testRelLabel": "memberOf $ c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Car $ c:@S@Vehicle@FI@type"
     // CAR-LABEL: "!testLabel": "c:@S@Vehicle@E@anonymous_record_no_typedef.c@{{[0-9]+}}@Car"
     // CAR: "title": "Car"
     // CAR:      "pathComponents": [
-    // CAR-NEXT:   "Vehicle",
-    // CAR-NEXT:   "type",
     // CAR-NEXT:   "Car"
     // CAR-NEXT: ]
 
@@ -151,32 +145,22 @@ struct Vehicle {
     // NAME-NEXT: ]
 };
 
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALENUM
+// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALCASE
+// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALOTHERCASE
 enum {
   GlobalCase,
   GlobalOtherCase
 };
-// GLOBALENUM-DAG: "!testRelLabel": "memberOf $ c:@Ea@GlobalCase@GlobalCase $ c:@Ea@GlobalCase"
-// GLOBALENUM-DAG: "!testRelLabel": "memberOf $ c:@Ea@GlobalCase@GlobalOtherCase $ c:@Ea@GlobalCase"
-// GLOBALENUM-LABEL: "!testLabel": "c:@Ea@GlobalCase"
-// GLOBALENUM:      "declarationFragments": [
-// GLOBALENUM-NEXT:   {
-// GLOBALENUM-NEXT:     "kind": "keyword",
-// GLOBALENUM-NEXT:     "spelling": "enum"
-// GLOBALENUM-NEXT:   },
-// GLOBALENUM-NEXT:   {
-// GLOBALENUM-NEXT:     "kind": "text",
-// GLOBALENUM-NEXT:     "spelling": " : "
-// GLOBALENUM-NEXT:   },
-// GLOBALENUM-NEXT:   {
-// GLOBALENUM-NEXT:     "kind": "typeIdentifier",
-// GLOBALENUM-NEXT:     "preciseIdentifier": "c:i",
-// GLOBALENUM-NEXT:     "spelling": "unsigned int"
-// GLOBALENUM-NEXT:   },
-// GLOBALENUM-NEXT:   {
-// GLOBALENUM-NEXT:     "kind": "text",
-// GLOBALENUM-NEXT:     "spelling": " { ... };"
-// GLOBALENUM-NEXT:   }
-// GLOBALENUM-NEXT: ]
+// GLOBALCASE-LABEL: "!testLabel": "c:@Ea@GlobalCase@GlobalCase"
+// GLOBALCASE: "title": "GlobalCase"
+// GLOBALCASE:      "pathComponents": [
+// GLOBALCASE-NEXT:   "GlobalCase"
+// GLOBALCASE-NEXT: ]
+
+// GLOBALOTHERCASE-LABEL: "!testLabel": "c:@Ea@GlobalCase@GlobalOtherCase"
+// GLOBALOTHERCASE: "title": "GlobalOtherCase"
+// GLOBALOTHERCASE:      "pathComponents": [
+// GLOBALOTHERCASE-NEXT:   "GlobalOtherCase"
+// GLOBALOTHERCASE-NEXT: ]
 
 // expected-no-diagnostics
diff --git a/clang/test/ExtractAPI/enum.c b/clang/test/ExtractAPI/enum.c
index 67e003834a7d5..58170aa0e1d90 100644
--- a/clang/test/ExtractAPI/enum.c
+++ b/clang/test/ExtractAPI/enum.c
@@ -115,18 +115,6 @@ enum {
       "source": "c:@E@Direction@West",
       "target": "c:@E@Direction",
       "targetFallback": "Direction"
-    },
-    {
-      "kind": "memberOf",
-      "source": "c:@Ea@Constant@Constant",
-      "target": "c:@Ea@Constant",
-      "targetFallback": "enum (unnamed)"
-    },
-    {
-      "kind": "memberOf",
-      "source": "c:@Ea@OtherConstant@OtherConstant",
-      "target": "c:@Ea@OtherConstant",
-      "targetFallback": "enum (unnamed)"
     }
   ],
   "symbols": [
@@ -677,55 +665,6 @@ enum {
         "West"
       ]
     },
-    {
-      "accessLevel": "public",
-      "declarationFragments": [
-        {
-          "kind": "keyword",
-          "spelling": "enum"
-        },
-        {
-          "kind": "text",
-          "spelling": " : "
-        },
-        {
-          "kind": "typeIdentifier",
-          "preciseIdentifier": "c:i",
-          "spelling": "unsigned int"
-        },
-        {
-          "kind": "text",
-          "spelling": " { ... };"
-        }
-      ],
-      "identifier": {
-        "interfaceLanguage": "c",
-        "precise": "c:@Ea@Constant"
-      },
-      "kind": {
-        "displayName": "Enumeration",
-        "identifier": "c.enum"
-      },
-      "location": {
-        "position": {
-          "character": 0,
-          "line": 16
-        },
-        "uri": "file://INPUT_DIR/input.h"
-      },
-      "names": {
-        "navigator": [
-          {
-            "kind": "identifier",
-            "spelling": "enum (unnamed)"
-          }
-        ],
-        "title": "enum (unnamed)"
-      },
-      "pathComponents": [
-        "enum (unnamed)"
-      ]
-    },
     {
       "accessLevel": "public",
       "declarationFragments": [
@@ -765,59 +704,9 @@ enum {
         "title": "Constant"
       },
       "pathComponents": [
-        "enum (unnamed)",
         "Constant"
       ]
     },
-    {
-      "accessLevel": "public",
-      "declarationFragments": [
-        {
-          "kind": "keyword",
-          "spelling": "enum"
-        },
-        {
-          "kind": "text",
-          "spelling": " : "
-        },
-        {
-          "kind": "typeIdentifier",
-          "preciseIdentifier": "c:i",
-          "spelling": "unsigned int"
-        },
-        {
-          "kind": "text",
-          "spelling": " { ... };"
-        }
-      ],
-      "identifier": {
-        "interfaceLanguage": "c",
-        "precise": "c:@Ea@OtherConstant"
-      },
-      "kind": {
-        "displayName": "Enumeration",
-        "identifier": "c.enum"
-      },
-      "location": {
-        "position": {
-          "character": 0,
-          "line": 20
-        },
-        "uri": "file://INPUT_DIR/input.h"
-      },
-      "names": {
-        "navigator": [
-          {
-            "kind": "identifier",
-            "spelling": "enum (unnamed)"
-          }
-        ],
-        "title": "enum (unnamed)"
-      },
-      "pathComponents": [
-        "enum (unnamed)"
-      ]
-    },
     {
       "accessLevel": "public",
       "declarationFragments": [
@@ -857,7 +746,6 @@ enum {
         "title": "OtherConstant"
       },
       "pathComponents": [
-        "enum (unnamed)",
         "OtherConstant"
       ]
     }
diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip
index 1271868a53b86..26da82843c512 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -231,7 +231,7 @@ extern "C" __device__ uint64_t test___make_mantissa(const char *p) {
 
 // CHECK-LABEL: @test_abs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef range(i32 0, -2147483648) i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true)
 // CHECK-NEXT:    ret i32 [[TMP0]]
 //
 extern "C" __device__ int test_abs(int x) {
@@ -240,7 +240,7 @@ extern "C" __device__ int test_abs(int x) {
 
 // CHECK-LABEL: @test_labs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
 // CHECK-NEXT:    ret i64 [[TMP0]]
 //
 extern "C" __device__ long test_labs(long x) {
@@ -249,7 +249,7 @@ extern "C" __device__ long test_labs(long x) {
 
 // CHECK-LABEL: @test_llabs(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call noundef range(i64 0, -9223372036854775808) i64 @llvm.abs.i64(i64 [[X:%.*]], i1 true)
 // CHECK-NEXT:    ret i64 [[TMP0]]
 //
 extern "C" __device__ long long test_llabs(long x) {
diff --git a/clang/test/Lexer/cxx2c-raw-strings.cpp b/clang/test/Lexer/cxx2c-raw-strings.cpp
new file mode 100644
index 0000000000000..569a4b8447e57
--- /dev/null
+++ b/clang/test/Lexer/cxx2c-raw-strings.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -Wc++26-extensions %s
+// RUN: %clang_cc1 -std=c++2c -fsyntax-only -verify=cxx26 -Wpre-c++26-compat %s
+
+int main() {
+  (void) R"abc`@$(foobar)abc`@$";
+  //expected-warning@-1 {{'`' in a raw string literal delimiter is a C++2c extension}}
+  //expected-warning@-2 {{'@' in a raw string literal delimiter is a C++2c extension}}
+  //expected-warning@-3 {{'$' in a raw string literal delimiter is a C++2c extension}}
+  //cxx26-warning@-4 {{'`' in a raw string literal delimiter is incompatible with standards before C++2c}}
+  //cxx26-warning@-5 {{'@' in a raw string literal delimiter is incompatible with standards before C++2c}}
+  //cxx26-warning@-6 {{'$' in a raw string literal delimiter is incompatible with standards before C++2c}}
+}
diff --git a/clang/test/Modules/pr93497.cppm b/clang/test/Modules/pr93497.cppm
new file mode 100644
index 0000000000000..64a08e2a85e63
--- /dev/null
+++ b/clang/test/Modules/pr93497.cppm
@@ -0,0 +1,106 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/mod.cppm \
+// RUN:     -emit-module-interface -o %t/mod.pcm
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/use.cpp \
+// RUN:     -fmodule-file=mod=%t/mod.pcm -emit-llvm \
+// RUN:     -o - | opt -S --passes=simplifycfg | FileCheck %t/use.cpp
+
+//--- mod.cppm
+export module mod;
+
+export struct Thing {
+    static const Thing One;
+    explicit Thing(int raw) :raw(raw) { }
+    int raw;
+};
+
+const Thing Thing::One = Thing(1);
+
+export struct C {
+    int value;
+};
+export const C ConstantValue = {1};
+
+export const C *ConstantPtr = &ConstantValue;
+
+C NonConstantValue = {1};
+export const C &ConstantRef = NonConstantValue;
+
+export struct NonConstexprDtor {
+    constexpr NonConstexprDtor(int raw) : raw(raw) {}
+    ~NonConstexprDtor();
+
+    int raw;
+};
+
+export const NonConstexprDtor NonConstexprDtorValue = {1};
+
+//--- use.cpp
+import mod;
+
+int consume(int);
+int consumeC(C);
+
+extern "C" __attribute__((noinline)) inline int unneeded() {
+    return consume(43);
+}
+
+extern "C" __attribute__((noinline)) inline int needed() {
+    return consume(43);
+}
+
+int use() {
+    Thing t1 = Thing::One;
+    return consume(t1.raw);
+}
+
+int use2() {
+    if (ConstantValue.value)
+        return consumeC(ConstantValue);
+    return unneeded();
+}
+
+int use3() {
+    auto Ptr = ConstantPtr;
+    if (Ptr->value)
+        return consumeC(*Ptr);
+    return needed();
+}
+
+int use4() {
+    auto Ref = ConstantRef;
+    if (Ref.value)
+        return consumeC(Ref);
+    return needed();
+}
+
+int use5() {
+    NonConstexprDtor V = NonConstexprDtorValue;
+    if (V.raw)
+        return consume(V.raw);
+    return needed();
+}
+
+// CHECK: @_ZNW3mod5Thing3OneE = external
+// CHECK: @_ZW3mod13ConstantValue ={{.*}}available_externally{{.*}} constant 
+// CHECK: @_ZW3mod11ConstantPtr = external
+// CHECK: @_ZW3mod16NonConstantValue = external
+// CHECK: @_ZW3mod21NonConstexprDtorValue = external
+
+// Check that the middle end can optimize the program using the constant information.
+// CHECK-NOT: @unneeded(
+
+// Check that the use of ConstantPtr won't get optimized incorrectly.
+// CHECK-LABEL: @_Z4use3v(
+// CHECK: @needed(
+
+// Check that the use of ConstantRef won't get optimized incorrectly.
+// CHECK-LABEL: @_Z4use4v(
+// CHECK: @needed(
+
+// Check that the use of NonConstexprDtorValue won't get optimized incorrectly.
+// CHECK-LABEL: @_Z4use5v(
+// CHECK: @needed(
diff --git a/clang/test/Parser/decls.c b/clang/test/Parser/decls.c
new file mode 100644
index 0000000000000..39ef05bf4bd99
--- /dev/null
+++ b/clang/test/Parser/decls.c
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 %s -fsyntax-only -verify -pedantic
+
+// Test that we can parse declarations at global scope.
+int v;
+
+void func(void) {
+  // Test that we can parse declarations within a compound statement.
+  int a;
+  {
+    int b;
+  }
+
+  int z = ({ // expected-warning {{use of GNU statement expression extension}}
+	// Test that we can parse declarations within a GNU statement expression.
+	int w = 12;
+	w;
+  });
+
+  // Test that we diagnose declarations where a statement is required.
+  // See GH92775.
+  if (1)
+    int x; // expected-error {{expected expression}}
+  for (;;)
+    int c; // expected-error {{expected expression}}
+
+  label:
+    int y; // expected-warning {{label followed by a declaration is a C23 extension}}
+
+  // Test that lookup works as expected.
+  (void)a;
+  (void)v;
+  (void)z;
+  (void)b; // expected-error {{use of undeclared identifier 'b'}}
+  (void)w; // expected-error {{use of undeclared identifier 'w'}}
+  (void)x; // expected-error {{use of undeclared identifier 'x'}}
+  (void)c; // expected-error {{use of undeclared identifier 'c'}}
+  (void)y;
+}
+
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 7567267be26b4..6c08b379c9386 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -754,7 +754,7 @@
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ccmp -x c -E -dM -o - %s | FileCheck --check-prefix=CCMP %s
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=nf -x c -E -dM -o - %s | FileCheck --check-prefix=NF %s
 // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=cf -x c -E -dM -o - %s | FileCheck --check-prefix=CF %s
-// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,APXF %s
+// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,CCMP,NF,APXF %s
 // APXF: #define __APX_F__ 1
 // CCMP: #define __CCMP__ 1
 // CF: #define __CF__ 1
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp
new file mode 100644
index 0000000000000..62a1f8e6de1d7
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp
@@ -0,0 +1,13 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -emit-llvm-only -verify -verify-ignore-unexpected=error,note -o - %s
+
+#include <arm_sme.h>
+
+void test_b16b16( svbfloat16_t bf16, svbfloat16x2_t bf16x2, svbfloat16x4_t bf16x4) __arm_streaming
+{
+  // expected-error@+1 {{'svclamp_single_bf16_x2' needs target feature sme2,b16b16}}
+  svclamp_single_bf16_x2(bf16x2, bf16, bf16);
+  // expected-error@+1 {{'svclamp_single_bf16_x4' needs target feature sme2,b16b16}}
+  svclamp_single_bf16_x4(bf16x4, bf16, bf16);
+}
\ No newline at end of file
diff --git a/clang/test/Sema/attr-noinline.cpp b/clang/test/Sema/attr-noinline.cpp
index bd6505b9fe98e..6da0e873af1b6 100644
--- a/clang/test/Sema/attr-noinline.cpp
+++ b/clang/test/Sema/attr-noinline.cpp
@@ -2,9 +2,9 @@
 
 int bar();
 
-// expected-note@+1{{conflicting attribute is here}}
+// expected-note@+1 2 {{conflicting attribute is here}}
 [[gnu::always_inline]] void always_inline_fn(void) { }
-// expected-note@+1{{conflicting attribute is here}}
+// expected-note@+1 2 {{conflicting attribute is here}}
 [[gnu::flatten]] void flatten_fn(void) { }
 [[gnu::noinline]] void noinline_fn(void) { }
 
@@ -25,7 +25,21 @@ void foo() {
   __attribute__((noinline)) bar(); // expected-warning {{attribute is ignored on this statement as it only applies to functions; use '[[clang::noinline]]' on statements}}
 }
 
+void ms_noi_check() {
+  [[msvc::noinline]] bar();
+  [[msvc::noinline(0)]] bar(); // expected-error {{'noinline' attribute takes no arguments}}
+  int x;
+  [[msvc::noinline]] x = 0; // expected-warning {{'noinline' attribute is ignored because there exists no call expression inside the statement}}
+  [[msvc::noinline]] { asm("nop"); } // expected-warning {{'noinline' attribute is ignored because there exists no call expression inside the statement}}
+  [[msvc::noinline]] label: x = 1; // expected-warning {{'noinline' attribute only applies to functions and statements}}
+
+  [[msvc::noinline]] always_inline_fn(); // expected-warning {{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}}
+  [[msvc::noinline]] flatten_fn(); // expected-warning {{statement attribute 'noinline' has higher precedence than function attribute 'flatten'}}
+  [[msvc::noinline]] noinline_fn();
+}
+
 [[clang::noinline]] static int i = bar(); // expected-warning {{'noinline' attribute only applies to functions and statements}}
+[[msvc::noinline]] static int j = bar(); // expected-warning {{'noinline' attribute only applies to functions and statements}}
 
 // This used to crash the compiler.
 template<int D>
@@ -69,7 +83,39 @@ int variadic_baz(int x) {
   [[clang::noinline]] return non_dependent(x) + (dependent<D>(x) + ...);
 }
 
+template<int D> [[clang::always_inline]]
+int qux(int x) { // #QUX
+  // expected-warning@+2{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}}
+  // expected-note@#NO_DEP{{conflicting attribute is here}}
+  [[msvc::noinline]] non_dependent(x);
+  if constexpr (D>0) {
+    // expected-warning@+6{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}}
+    // expected-note@#NO_DEP{{conflicting attribute is here}}
+    // expected-warning@+4 3{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}}
+    // expected-note@#QUX 3{{conflicting attribute is here}}
+    // expected-note@#QUX_INST 3{{in instantiation}}
+    // expected-note@+1 3{{in instantiation}}
+    [[msvc::noinline]] return non_dependent(x), qux<D-1>(x + 1);
+  }
+  return x;
+}
+
+// We can't suppress if there is a variadic involved.
+template<int ... D>
+int variadic_qux(int x) {
+  // Diagnoses NO_DEP 2x, once during phase 1, the second during instantiation.
+  // Diagnoses DEP 3x, once per variadic expansion.
+  // expected-warning@+5 2{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}}
+  // expected-note@#NO_DEP 2{{conflicting attribute is here}}
+  // expected-warning@+3 3{{statement attribute 'noinline' has higher precedence than function attribute 'always_inline'}}
+  // expected-note@#DEP 3{{conflicting attribute is here}}
+  // expected-note@#QUX_VARIADIC_INST{{in instantiation}}
+  [[msvc::noinline]] return non_dependent(x) + (dependent<D>(x) + ...);
+}
+
 void use() {
   baz<3>(0); // #BAZ_INST
   variadic_baz<0, 1, 2>(0); // #VARIADIC_INST
+  qux<3>(0); // #QUX_INST
+  variadic_qux<0, 1, 2>(0); // #QUX_VARIADIC_INST
 }
diff --git a/clang/test/SemaCXX/invalid-if-constexpr.cpp b/clang/test/SemaCXX/invalid-if-constexpr.cpp
index 7643c47488f05..0007f2739cbbd 100644
--- a/clang/test/SemaCXX/invalid-if-constexpr.cpp
+++ b/clang/test/SemaCXX/invalid-if-constexpr.cpp
@@ -4,8 +4,7 @@ namespace GH61885 {
 void similar() { // expected-note {{'similar' declared here}}
   if constexpr (similer<>) {} // expected-error {{use of undeclared identifier 'similer'; did you mean 'similar'?}}
 }
-void a() { if constexpr (__adl_swap<>) {}} // expected-error{{use of undeclared identifier '__adl_swap'; did you mean '__sync_swap'?}} \
-                                           // expected-note {{'__sync_swap' declared here}}
+void a() { if constexpr (__adl_swap<>) {}} // expected-error{{use of undeclared identifier '__adl_swap'; did you mean '__sync_swap'?}}
 
 int AA() { return true;} // expected-note {{'AA' declared here}}
 
diff --git a/clang/test/SemaCXX/invalid-this-in-lambda.cpp b/clang/test/SemaCXX/invalid-this-in-lambda.cpp
new file mode 100644
index 0000000000000..ae65bda025e23
--- /dev/null
+++ b/clang/test/SemaCXX/invalid-this-in-lambda.cpp
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
+decltype([]()->decltype(this) { }) a; // expected-error {{invalid use of 'this' outside of a non-static member function}}
+
diff --git a/clang/test/SemaCXX/typo-correction-builtin-func.cpp b/clang/test/SemaCXX/typo-correction-builtin-func.cpp
new file mode 100644
index 0000000000000..8d369034d1be3
--- /dev/null
+++ b/clang/test/SemaCXX/typo-correction-builtin-func.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// Test that clang does not emit 'declared here' note for builtin functions that don't have a declaration in source.
+
+void t0() {
+  constexpr float A = __builtin_isinfinity(); // expected-error {{use of undeclared identifier '__builtin_isinfinity'; did you mean '__builtin_isfinite'?}}
+                                              // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
diff --git a/clang/test/SemaOpenACC/parallel-loc-and-stmt.c b/clang/test/SemaOpenACC/parallel-loc-and-stmt.c
index ba29f6da8ba25..bbcdd823483a5 100644
--- a/clang/test/SemaOpenACC/parallel-loc-and-stmt.c
+++ b/clang/test/SemaOpenACC/parallel-loc-and-stmt.c
@@ -33,9 +33,11 @@ int foo3;
 
 void func() {
   // FIXME: Should we disallow this on declarations, or consider this to be on
-  // the initialization?
+  // the initialization? This is currently rejected in C because
+  // Parser::ParseOpenACCDirectiveStmt() calls ParseStatement() and passes the
+  // statement context as "SubStmt" which does not allow for a declaration in C.
 #pragma acc parallel
-  int foo;
+  int foo; // expected-error {{expected expression}}
 
 #pragma acc parallel
   {
diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
index 9fb6b440b6b2a..e74c031eba4c1 100644
--- a/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
+++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -std=c++20 -Wconversion -verify %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++2c -Wconversion -verify %s
 
 struct Test {
     int a = 0;
@@ -102,3 +102,24 @@ void bar() {
 }
 
 }
+
+namespace GH84052 {
+
+template <class... T>
+concept C = sizeof(T...[1]) == 1; // #C
+
+struct A {};
+
+template <class T, C<T> auto = A{}> struct Set {}; // #Set
+
+template <class T> void foo() {
+  Set<T> unrelated;
+}
+
+Set<bool> sb;
+Set<float> sf;
+// expected-error@-1 {{constraints not satisfied for class template 'Set'}}
+// expected-note@#Set {{because 'C<decltype(GH84052::A{}), float>' evaluated to false}}
+// expected-note@#C {{evaluated to false}}
+
+} // namespace GH84052
diff --git a/clang/test/TableGen/deferred-diag.td b/clang/test/TableGen/deferred-diag.td
index c1906d4a9e45e..d7e8e694c7b3e 100644
--- a/clang/test/TableGen/deferred-diag.td
+++ b/clang/test/TableGen/deferred-diag.td
@@ -4,24 +4,24 @@ include "DiagnosticBase.inc"
 
 // Test usage of Deferrable and NonDeferrable in diagnostics.
 
-def test_default : Error<"This error is non-deferrable by default">;
+def test_default : Error<"this error is non-deferrable by default">;
 // CHECK-DAG: DIAG(test_default, {{.*}}SFINAE_SubstitutionFailure, false, true, true, false, 0)
 
-def test_deferrable : Error<"This error is deferrable">, Deferrable;
+def test_deferrable : Error<"this error is deferrable">, Deferrable;
 // CHECK-DAG: DIAG(test_deferrable, {{.*}} SFINAE_SubstitutionFailure, false, true, true, true, 0)
 
-def test_non_deferrable : Error<"This error is non-deferrable">, NonDeferrable;
+def test_non_deferrable : Error<"this error is non-deferrable">, NonDeferrable;
 // CHECK-DAG: DIAG(test_non_deferrable, {{.*}} SFINAE_SubstitutionFailure, false, true, true, false, 0)
 
 let Deferrable = 1 in {
 
-def test_let : Error<"This error is deferrable by let">;
+def test_let : Error<"this error is deferrable by let">;
 // CHECK-DAG: DIAG(test_let, {{.*}} SFINAE_SubstitutionFailure, false, true, true, true, 0)
 
 // Make sure TextSubstitution is allowed in the let Deferrable block.
 def textsub : TextSubstitution<"%select{text1|text2}0">;
 
-def test_let2 : Error<"This error is deferrable by let %sub{textsub}0">;
+def test_let2 : Error<"this error is deferrable by let %sub{textsub}0">;
 // CHECK-DAG: DIAG(test_let2, {{.*}} SFINAE_SubstitutionFailure, false, true, true, true, 0)
 
 }
diff --git a/clang/test/TableGen/text-substitution.td b/clang/test/TableGen/text-substitution.td
index aafdbe48c43be..b0d030aca6513 100644
--- a/clang/test/TableGen/text-substitution.td
+++ b/clang/test/TableGen/text-substitution.td
@@ -26,8 +26,8 @@ def sub_test_rewrite : TextSubstitution<
 // CHECK-SAME: Q! %q1.
 // CHECK-SAME: PLACEHOLDER! %0.OBJCCLASS!
 // CHECK-SAME: %objcclass5. OBJCINSTANCE!
-// CHECK-SAME: %objcinstance4.  DONE!",
-def test_rewrite: Error<"%sub{sub_test_rewrite}5,4,3,2,1,0 DONE!">;
+// CHECK-SAME: %objcinstance4.  DONE",
+def test_rewrite: Error<"%sub{sub_test_rewrite}5,4,3,2,1,0 DONE">;
 
 def test_sub_basic : Error<"%sub{yes_no}0">;
 // CHECK: test_sub_basic
diff --git a/clang/test/TableGen/wording-errors.td b/clang/test/TableGen/wording-errors.td
new file mode 100644
index 0000000000000..eb5eb2f547c78
--- /dev/null
+++ b/clang/test/TableGen/wording-errors.td
@@ -0,0 +1,55 @@
+// RUN: not clang-tblgen -gen-clang-diags-defs -I%S %s -o /dev/null 2>&1 | FileCheck %s
+include "DiagnosticBase.inc"
+
+// Ensure we catch a capital letter at the start of a diagnostic.
+def zero : Error<
+  "This is bad">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'This' is invalid
+
+// Test that we also correctly handle selections.
+def one : Error<
+  "%select{|or}0 That">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'That' is invalid
+def two : Error<
+  "%select{as does|}0 This">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'This' is invalid
+def three : Error<
+  "%select{and||of course}0 Whatever">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not start with a capital letter; 'Whatever' is invalid
+
+// Test that we accept the following cases.
+def four : Error<
+  "this is fine">;
+def five : Error<
+  "%select{this|is|also}0 Fine">;
+def six : Error<
+  "%select{this|is|also|}0 fine">;
+def seven : Error<
+  "%select{ARC|C|C23|C++14|OpenMP}0 are also fine">;
+
+// Next, test that we catch punctuation at the end of the diagnostic.
+def eight : Error<
+  "punctuation is bad.">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid
+def nine : Error<
+  "it's really bad!">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '!' is invalid
+def ten : Error<
+  "we also catch %select{punctuation.|in select}0">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid
+def eleven : Error<
+  "and %select{|here.}0">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid
+def twelve : Error<
+  "and %select{here.|}0">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid
+def thirteen : Error<
+  "and even %select{|here.|}0">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid
+def fourteen : Error<
+  "and %select{here}0.">;
+// CHECK-DAG: wording-errors.td:[[@LINE-2]]:5: error: Diagnostics should not end with punctuation; '.' is invalid
+
+// Test that we accept the following cases.
+def fifteen : Error<
+  "question marks are intentionally okay?">;
diff --git a/clang/tools/libclang/CXExtractAPI.cpp b/clang/tools/libclang/CXExtractAPI.cpp
index d74f3740406c5..c35558e66fcb9 100644
--- a/clang/tools/libclang/CXExtractAPI.cpp
+++ b/clang/tools/libclang/CXExtractAPI.cpp
@@ -45,6 +45,9 @@ struct LibClangExtractAPIVisitor
       : ExtractAPIVisitor<LibClangExtractAPIVisitor>(Context, API) {}
 
   const RawComment *fetchRawCommentForDecl(const Decl *D) const {
+    if (const auto *Comment = Base::fetchRawCommentForDecl(D))
+      return Comment;
+
     return Context.getRawCommentForAnyRedecl(D);
   }
 
diff --git a/clang/unittests/Interpreter/CodeCompletionTest.cpp b/clang/unittests/Interpreter/CodeCompletionTest.cpp
index 873fbda32f057..72c02c683fafd 100644
--- a/clang/unittests/Interpreter/CodeCompletionTest.cpp
+++ b/clang/unittests/Interpreter/CodeCompletionTest.cpp
@@ -4,6 +4,7 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
 #include "clang/Sema/Sema.h"
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
 #include "llvm/LineEditor/LineEditor.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
@@ -11,6 +12,10 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+#if defined(_AIX) || defined(__MVS__)
+#define CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+#endif
+
 using namespace clang;
 namespace {
 auto CB = clang::IncrementalCompilerBuilder();
@@ -50,7 +55,21 @@ static std::vector<std::string> runComp(clang::Interpreter &MainInterp,
   return Comps;
 }
 
+static bool HostSupportsJit() {
+  auto J = llvm::orc::LLJITBuilder().create();
+  if (J)
+    return true;
+  LLVMConsumeError(llvm::wrap(J.takeError()));
+  return false;
+}
+
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_Sanity) {
+#else
 TEST(CodeCompletionTest, Sanity) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("int foo = 12;"));
   auto Err = llvm::Error::success();
@@ -61,7 +80,13 @@ TEST(CodeCompletionTest, Sanity) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_SanityNoneValid) {
+#else
 TEST(CodeCompletionTest, SanityNoneValid) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("int foo = 12;"));
   auto Err = llvm::Error::success();
@@ -70,7 +95,13 @@ TEST(CodeCompletionTest, SanityNoneValid) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_TwoDecls) {
+#else
 TEST(CodeCompletionTest, TwoDecls) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("int application = 12;"));
   cantFail(Interp->Parse("int apple = 12;"));
@@ -80,14 +111,26 @@ TEST(CodeCompletionTest, TwoDecls) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_CompFunDeclsNoError) {
+#else
 TEST(CodeCompletionTest, CompFunDeclsNoError) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   auto Err = llvm::Error::success();
   auto comps = runComp(*Interp, "void app(", Err);
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_TypedDirected) {
+#else
 TEST(CodeCompletionTest, TypedDirected) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("int application = 12;"));
   cantFail(Interp->Parse("char apple = '2';"));
@@ -119,7 +162,13 @@ TEST(CodeCompletionTest, TypedDirected) {
   }
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_SanityClasses) {
+#else
 TEST(CodeCompletionTest, SanityClasses) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("struct Apple{};"));
   cantFail(Interp->Parse("void takeApple(Apple &a1){}"));
@@ -142,7 +191,13 @@ TEST(CodeCompletionTest, SanityClasses) {
   }
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_SubClassing) {
+#else
 TEST(CodeCompletionTest, SubClassing) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("struct Fruit {};"));
   cantFail(Interp->Parse("struct Apple : Fruit{};"));
@@ -157,7 +212,13 @@ TEST(CodeCompletionTest, SubClassing) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_MultipleArguments) {
+#else
 TEST(CodeCompletionTest, MultipleArguments) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse("int foo = 42;"));
   cantFail(Interp->Parse("char fowl = 'A';"));
@@ -169,7 +230,13 @@ TEST(CodeCompletionTest, MultipleArguments) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_Methods) {
+#else
 TEST(CodeCompletionTest, Methods) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse(
       "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};"));
@@ -183,7 +250,13 @@ TEST(CodeCompletionTest, Methods) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_MethodsInvocations) {
+#else
 TEST(CodeCompletionTest, MethodsInvocations) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse(
       "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};"));
@@ -197,7 +270,13 @@ TEST(CodeCompletionTest, MethodsInvocations) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_NestedInvocations) {
+#else
 TEST(CodeCompletionTest, NestedInvocations) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(Interp->Parse(
       "struct Foo{int add(int a){return 42;} int par(int b){return 42;}};"));
@@ -212,7 +291,13 @@ TEST(CodeCompletionTest, NestedInvocations) {
   EXPECT_EQ((bool)Err, false);
 }
 
+#ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
+TEST(CodeCompletionTest, DISABLED_TemplateFunctions) {
+#else
 TEST(CodeCompletionTest, TemplateFunctions) {
+#endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
   auto Interp = createInterpreter();
   cantFail(
       Interp->Parse("template <typename T> T id(T a) { return a;} "));
diff --git a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp
index 54159173d91e3..732753f11306e 100644
--- a/clang/unittests/Interpreter/IncrementalProcessingTest.cpp
+++ b/clang/unittests/Interpreter/IncrementalProcessingTest.cpp
@@ -36,14 +36,6 @@ using namespace clang;
 
 namespace {
 
-static bool HostSupportsJit() {
-  auto J = llvm::orc::LLJITBuilder().create();
-  if (J)
-    return true;
-  LLVMConsumeError(llvm::wrap(J.takeError()));
-  return false;
-}
-
 // Incremental processing produces several modules, all using the same "main
 // file". Make sure CodeGen can cope with that, e.g. for static initializers.
 const char TestProgram1[] = "extern \"C\" int funcForProg1() { return 17; }\n"
@@ -64,11 +56,22 @@ const Function *getGlobalInit(llvm::Module *M) {
   return nullptr;
 }
 
+static bool HostSupportsJit() {
+  auto MaybeJit = llvm::orc::LLJITBuilder().create();
+  const bool Supported = static_cast<bool>(MaybeJit);
+  if (!Supported)
+    LLVMConsumeError(llvm::wrap(MaybeJit.takeError()));
+  return Supported;
+}
+
 #ifdef CLANG_INTERPRETER_PLATFORM_CANNOT_CREATE_LLJIT
 TEST(IncrementalProcessing, DISABLED_EmitCXXGlobalInitFunc) {
 #else
 TEST(IncrementalProcessing, EmitCXXGlobalInitFunc) {
 #endif
+  if (!HostSupportsJit())
+    GTEST_SKIP();
+
   std::vector<const char *> ClangArgv = {"-Xclang", "-emit-llvm-only"};
   auto CB = clang::IncrementalCompilerBuilder();
   CB.SetCompilerArgs(ClangArgv);
diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
index f564689fff7cf..b290530444d2a 100644
--- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -1213,6 +1213,197 @@ static bool isRemark(const Record &Diag) {
   return ClsName == "CLASS_REMARK";
 }
 
+// Presumes the text has been split at the first space, '-', ',', or '}'.
+static bool isExemptAtStart(StringRef Text) {
+  // Fast path: empty text, or a lowercase or non-alphanumeric first character.
+  if (Text.empty() || isLower(Text[0]) || !isAlnum(Text[0]))
+    return true;
+
+  // If the text is all uppercase (or numbers, +, or _), then we assume it's an
+  // acronym and that's allowed. This covers cases like ISO, C23, C++14, and
+  // OBJECT_MODE. However, if there's only a single letter other than "C", we
+  // do not exempt it so that we catch a case like "A really bad idea" while
+  // still allowing a case like "C does not allow...".
+  if (llvm::all_of(Text, [](char C) {
+        return isUpper(C) || isDigit(C) || C == '+' || C == '_';
+      }))
+    return Text.size() > 1 || Text[0] == 'C';
+
+  // Otherwise, there are a few other exemptions.
+  return StringSwitch<bool>(Text)
+      .Case("AddressSanitizer", true)
+      .Case("CFString", true)
+      .Case("Clang", true)
+      .Case("Fuchsia", true)
+      .Case("GNUstep", true)
+      .Case("IBOutletCollection", true)
+      .Case("Microsoft", true)
+      .Case("Neon", true)
+      .StartsWith("NSInvocation", true) // NSInvocation, NSInvocation's
+      .Case("Objective", true) // Objective-C (hyphen is a word boundary)
+      .Case("OpenACC", true)
+      .Case("OpenCL", true)
+      .Case("OpenMP", true)
+      .Case("Pascal", true)
+      .Case("Swift", true)
+      .Case("Unicode", true)
+      .Case("Vulkan", true)
+      .Case("WebAssembly", true)
+      .Default(false);
+}
+
+// Does not presume the text has been split at all. Empty text is exempt,
+// which also keeps this helper from calling back() on an empty StringRef.
+static bool isExemptAtEnd(StringRef Text) {
+  if (Text.empty())
+    return true;
+  // Only '.' and '!' are rejected. Everything else is allowed, including '?',
+  // which is needed to support "; did you mean?".
+  switch (Text.back()) {
+  case '.':
+  case '!':
+    return false;
+  default:
+    return true;
+  }
+}
+
+// Reports diagnostics that start with a capital or end with '.' or '!'.
+static void verifyDiagnosticWording(const Record &Diag) {
+  StringRef FullDiagText = Diag.getValueAsString("Summary");
+
+  auto DiagnoseStart = [&](StringRef Text) {
+    // Verify that the text does not start with a capital letter, except for
+    // special cases that are exempt like ISO and C++. Find the first word
+    // by looking for a word breaking character.
+    char Separators[] = {' ', '-', ',', '}'};
+    auto Iter = std::find_first_of(
+        Text.begin(), Text.end(), std::begin(Separators), std::end(Separators));
+
+    StringRef First = Text.substr(0, Iter - Text.begin());
+    if (!isExemptAtStart(First)) {
+      PrintError(&Diag,
+                 "Diagnostics should not start with a capital letter; '" +
+                     First + "' is invalid");
+    }
+  };
+
+  auto DiagnoseEnd = [&](StringRef Text) {
+    // Verify that the text does not end with punctuation like '.' or '!'.
+    if (!isExemptAtEnd(Text)) {
+      PrintError(&Diag, "Diagnostics should not end with punctuation; '" +
+                            Text.substr(Text.size() - 1, 1) + "' is invalid");
+    }
+  };
+
+  // If the diagnostic starts with %select, look through it to see whether any
+  // of the options will cause a problem.
+  if (FullDiagText.starts_with("%select{")) {
+    // Do a balanced delimiter scan from the start of the text to find the
+    // closing '}', skipping intermediary {} pairs.
+    size_t BraceCount = 1;
+    constexpr size_t PercentSelectBraceLen = sizeof("%select{") - 1;
+    auto Iter = FullDiagText.begin() + PercentSelectBraceLen;
+    for (auto End = FullDiagText.end(); Iter != End; ++Iter) {
+      char Ch = *Iter;
+      if (Ch == '{')
+        ++BraceCount;
+      else if (Ch == '}')
+        --BraceCount;
+      if (!BraceCount)
+        break;
+    }
+    // Defending against a malformed diagnostic string.
+    if (BraceCount != 0)
+      return;
+
+    StringRef SelectText =
+        FullDiagText.substr(PercentSelectBraceLen, Iter - FullDiagText.begin() -
+                                                       PercentSelectBraceLen);
+    SmallVector<StringRef, 4> SelectPieces;
+    SelectText.split(SelectPieces, '|');
+
+    // Walk over all of the individual pieces of select text to see if any of
+    // them start with an invalid character. If any of the select pieces is
+    // empty, we need to look at the first word after the %select to see
+    // whether that is invalid or not. If all of the pieces are fine, then we
+    // don't need to check anything else about the start of the diagnostic.
+    bool CheckSecondWord = false;
+    for (StringRef Piece : SelectPieces) {
+      if (Piece.empty())
+        CheckSecondWord = true;
+      else
+        DiagnoseStart(Piece);
+    }
+
+    if (CheckSecondWord) {
+      // There was an empty select piece, so we need to check the second
+      // word. This catches situations like '%select{|fine}0 Not okay'. Add
+      // two to account for the closing curly brace and the number after it.
+      StringRef AfterSelect =
+          FullDiagText.substr(Iter - FullDiagText.begin() + 2).ltrim();
+      DiagnoseStart(AfterSelect);
+    }
+  } else {
+    // If the start of the diagnostic is not %select, we can check the first
+    // word and be done with it.
+    DiagnoseStart(FullDiagText);
+  }
+
+  // If the last character in the diagnostic is a number preceded by a }, scan
+  // backwards to see if this is for a %select{...}0. If it is, we need to look
+  // at each piece to see whether it ends in punctuation or not.
+  bool StillNeedToDiagEnd = true;
+  if (FullDiagText.size() > 2 && isDigit(FullDiagText.back()) &&
+      *(FullDiagText.end() - 2) == '}') {
+    // Scan backwards from just before the '}' to find the matching '{'.
+    size_t BraceCount = 1;
+    auto Iter = FullDiagText.end() - sizeof("}0");
+    for (auto End = FullDiagText.begin(); Iter != End; --Iter) {
+      char Ch = *Iter;
+      if (Ch == '}')
+        ++BraceCount;
+      else if (Ch == '{')
+        --BraceCount;
+      if (!BraceCount)
+        break;
+    }
+    // Defending against a malformed diagnostic string.
+    if (BraceCount != 0)
+      return;
+
+    // Continue the backwards scan to find the word before the '{' to see if it
+    // is 'select'.
+    constexpr size_t SelectLen = sizeof("select") - 1;
+    bool IsSelect =
+        (FullDiagText.substr(Iter - SelectLen - FullDiagText.begin(),
+                             SelectLen) == "select");
+    if (IsSelect) {
+      // Gather the content between the {} for the select in question so we can
+      // split it into pieces.
+      StillNeedToDiagEnd = false; // No longer need to handle the end.
+      StringRef SelectText =
+          FullDiagText.substr(Iter - FullDiagText.begin() + /*{*/ 1,
+                              FullDiagText.end() - Iter - /*pos before }0*/ 3);
+      SmallVector<StringRef, 4> SelectPieces;
+      SelectText.split(SelectPieces, '|');
+      for (StringRef Piece : SelectPieces) {
+        // Not worrying about a situation like: "this is bar. %select{foo|}0".
+        if (!Piece.empty())
+          DiagnoseEnd(Piece);
+      }
+    }
+  }
+
+  // Handle the end now unless a trailing %select covered it or text is empty.
+  if (StillNeedToDiagEnd && !FullDiagText.empty())
+    DiagnoseEnd(FullDiagText);
+
+  // FIXME: This could also be improved by looking for instances of clang or
+  // gcc in the diagnostic and recommend Clang or GCC instead. However, this
+  // runs into odd situations like [[clang::warn_unused_result]],
+  // #pragma clang, or --unwindlib=libgcc.
+}
 
 /// ClangDiagsDefsEmitter - The top-level class emits .def files containing
 /// declarations of Clang diagnostics.
@@ -1273,6 +1464,9 @@ void clang::EmitClangDiagsDefs(RecordKeeper &Records, raw_ostream &OS,
     if (!Component.empty() && Component != R.getValueAsString("Component"))
       continue;
 
+    // Validate diagnostic wording for common issues.
+    verifyDiagnosticWording(R);
+
     OS << "DIAG(" << R.getName() << ", ";
     OS << R.getValueAsDef("Class")->getName();
     OS << ", (unsigned)diag::Severity::"
diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp
index a2fc27de1901b..9375e27d4f4d2 100644
--- a/compiler-rt/lib/msan/msan.cpp
+++ b/compiler-rt/lib/msan/msan.cpp
@@ -100,7 +100,17 @@ int msan_report_count = 0;
 
 // Array of stack origins.
 // FIXME: make it resizable.
-static const uptr kNumStackOriginDescrs = 1024 * 1024;
+// Although BSS memory doesn't cost anything until used, it is limited to 2GB
+// in some configurations (e.g., "relocation R_X86_64_PC32 out of range:
+// ... is not in [-2147483648, 2147483647]; references section '.bss'").
+// We use kNumStackOriginDescrs * (sizeof(char*) + sizeof(uptr)) == 64MB.
+#ifdef SANITIZER_PPC
+// soft_rss_limit test (release_origin.c) fails on PPC if kNumStackOriginDescrs
+// is too high
+static const uptr kNumStackOriginDescrs = 1 * 1024 * 1024;
+#else
+static const uptr kNumStackOriginDescrs = 4 * 1024 * 1024;
+#endif  // SANITIZER_PPC
 static const char *StackOriginDescr[kNumStackOriginDescrs];
 static uptr StackOriginPC[kNumStackOriginDescrs];
 static atomic_uint32_t NumStackOriginDescrs;
diff --git a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
index 06a44f1885656..510ff72998914 100644
--- a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
+++ b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
@@ -101,6 +101,10 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern {
                         mlir::Value box,
                         mlir::ConversionPatternRewriter &rewriter) const;
 
+  mlir::Value getRankFromBox(mlir::Location loc, TypePair boxTy,
+                             mlir::Value box,
+                             mlir::ConversionPatternRewriter &rewriter) const;
+
   // Get the element type given an LLVM type that is of the form
   // (array|struct|vector)+ and the provided indexes.
   mlir::Type getBoxEleTy(mlir::Type type,
diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.h b/flang/include/flang/Optimizer/HLFIR/Passes.h
index edefe36de00c1..83388d0527e19 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.h
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.h
@@ -20,10 +20,6 @@
 
 namespace hlfir {
 #define GEN_PASS_DECL
-#include "flang/Optimizer/HLFIR/Passes.h.inc"
-
-std::unique_ptr<mlir::Pass> createConvertHLFIRtoFIRPass();
-
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/HLFIR/Passes.h.inc"
 } // namespace hlfir
diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index 1dd2e3dc81911..ed49f5093c965 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -12,7 +12,6 @@
 include "mlir/Pass/PassBase.td"
 def ConvertHLFIRtoFIR : Pass<"convert-hlfir-to-fir", "::mlir::ModuleOp"> {
   let summary = "Lower High-Level FIR to FIR";
-  let constructor = "hlfir::createConvertHLFIRtoFIRPass()";
   let dependentDialects = [
     "mlir::func::FuncDialect",
   ];
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index bb3c90ebc04d4..61ea7a7f9bbdd 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -331,7 +331,7 @@ inline void createHLFIRToFIRPassPipeline(
   pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
   pm.addPass(hlfir::createLowerHLFIRIntrinsics());
   pm.addPass(hlfir::createBufferizeHLFIR());
-  pm.addPass(hlfir::createConvertHLFIRtoFIRPass());
+  pm.addPass(hlfir::createConvertHLFIRtoFIR());
 }
 
 /// Create a pass pipeline for handling certain OpenMP transformations needed
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 075d0634fd1ee..8e9c1d640c330 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -41,9 +41,15 @@
 #include "flang/Optimizer/Support/Utils.h"
 #include "flang/Semantics/runtime-type-info.h"
 #include "flang/Semantics/tools.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include <optional>
 
+static llvm::cl::opt<bool> allowAssumedRank(
+    "allow-assumed-rank",
+    llvm::cl::desc("Enable assumed rank lowering - experimental"),
+    llvm::cl::init(false));
+
 #define DEBUG_TYPE "flang-lower-variable"
 
 /// Helper to lower a scalar expression using a specific symbol mapping.
@@ -1885,7 +1891,8 @@ void Fortran::lower::mapSymbolAttributes(
     return;
   }
 
-  if (Fortran::evaluate::IsAssumedRank(sym))
+  const bool isAssumedRank = Fortran::evaluate::IsAssumedRank(sym);
+  if (isAssumedRank && !allowAssumedRank)
     TODO(loc, "assumed-rank variable in procedure implemented in Fortran");
 
   Fortran::lower::BoxAnalyzer ba;
@@ -1894,6 +1901,8 @@ void Fortran::lower::mapSymbolAttributes(
   // First deal with pointers and allocatables, because their handling here
   // is the same regardless of their rank.
   if (Fortran::semantics::IsAllocatableOrPointer(sym)) {
+    if (isAssumedRank)
+      TODO(loc, "assumed-rank pointer or allocatable");
     // Get address of fir.box describing the entity.
     // global
     mlir::Value boxAlloc = preAlloc;
@@ -1942,7 +1951,7 @@ void Fortran::lower::mapSymbolAttributes(
         if (mlir::Value len =
                 lowerExplicitCharLen(converter, loc, ba, symMap, stmtCtx))
           explicitParams.push_back(len);
-        if (sym.Rank() == 0) {
+        if (!isAssumedRank && sym.Rank() == 0) {
           // Do not keep scalar characters as fir.box (even when optional).
           // Lowering and FIR is not meant to deal with scalar characters as
           // fir.box outside of calls.
@@ -1987,9 +1996,11 @@ void Fortran::lower::mapSymbolAttributes(
         }
       }
       // TODO: derived type length parameters.
-      lowerExplicitLowerBounds(converter, loc, ba, lbounds, symMap, stmtCtx);
-      lowerExplicitExtents(converter, loc, ba, lbounds, explicitExtents, symMap,
-                           stmtCtx);
+      if (!isAssumedRank) {
+        lowerExplicitLowerBounds(converter, loc, ba, lbounds, symMap, stmtCtx);
+        lowerExplicitExtents(converter, loc, ba, lbounds, explicitExtents,
+                             symMap, stmtCtx);
+      }
       genBoxDeclare(converter, symMap, sym, dummyArg, lbounds, explicitParams,
                     explicitExtents, replace);
       return;
@@ -2021,6 +2032,11 @@ void Fortran::lower::mapSymbolAttributes(
     if (isUnusedEntryDummy) {
       assert(!Fortran::semantics::IsAllocatableOrPointer(sym) &&
              "handled above");
+      // Need to add support for allocatable assumed-rank to use
+      // logic below, or to simplify it and add codegen for fir.zero
+      // !fir.box<> instead.
+      if (isAssumedRank)
+        TODO(loc, "assumed rank in ENTRY");
       // The box is read right away because lowering code does not expect
       // a non pointer/allocatable symbol to be mapped to a MutableBox.
       mlir::Type ty = converter.genType(var);
@@ -2042,6 +2058,13 @@ void Fortran::lower::mapSymbolAttributes(
     return false;
   };
 
+  if (isAssumedRank) {
+    assert(isUnusedEntryDummy && "assumed rank must be pointers/allocatables "
+                                 "or descriptor dummy arguments");
+    genUnusedEntryPointBox();
+    return;
+  }
+
   // Helper to generate scalars for the symbol properties.
   auto genValue = [&](const Fortran::lower::SomeExpr &expr) {
     return genScalarValue(converter, loc, expr, symMap, stmtCtx);
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index b722e19272ca1..557a9685024c5 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -86,7 +86,7 @@ void DataSharingProcessor::insertDeallocs() {
     if (semantics::IsAllocatable(sym->GetUltimate())) {
       if (!useDelayedPrivatization) {
         converter.createHostAssociateVarCloneDealloc(*sym);
-        return;
+        continue;
       }
 
       lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym);
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 74e68725003cb..664453ebaf2f7 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -391,9 +391,8 @@ struct BoxIsArrayOpConversion : public fir::FIROpConversion<fir::BoxIsArrayOp> {
     mlir::Value a = adaptor.getOperands()[0];
     auto loc = boxisarray.getLoc();
     TypePair boxTyPair = getBoxTypePair(boxisarray.getVal().getType());
-    auto rank = getValueFromBox(loc, boxTyPair, a, rewriter.getI32Type(),
-                                rewriter, kRankPosInBox);
-    auto c0 = genConstantOffset(loc, rewriter, 0);
+    mlir::Value rank = getRankFromBox(loc, boxTyPair, a, rewriter);
+    mlir::Value c0 = genConstantIndex(loc, rank.getType(), rewriter, 0);
     rewriter.replaceOpWithNewOp<mlir::LLVM::ICmpOp>(
         boxisarray, mlir::LLVM::ICmpPredicate::ne, rank, c0);
     return mlir::success();
@@ -430,8 +429,8 @@ struct BoxRankOpConversion : public fir::FIROpConversion<fir::BoxRankOp> {
     auto loc = boxrank.getLoc();
     mlir::Type ty = convertType(boxrank.getType());
     TypePair boxTyPair = getBoxTypePair(boxrank.getVal().getType());
-    auto result =
-        getValueFromBox(loc, boxTyPair, a, ty, rewriter, kRankPosInBox);
+    mlir::Value rank = getRankFromBox(loc, boxTyPair, a, rewriter);
+    mlir::Value result = integerCast(loc, rewriter, ty, rank);
     rewriter.replaceOp(boxrank, result);
     return mlir::success();
   }
diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
index 69e78167b0733..8c726d547491a 100644
--- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
+++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
@@ -179,6 +179,14 @@ mlir::Value ConvertFIRToLLVMPattern::getElementSizeFromBox(
   return getValueFromBox(loc, boxTy, box, resultTy, rewriter, kElemLenPosInBox);
 }
 
+/// Read the rank from a fir.box, typed as the rank field of the LLVM box type.
+mlir::Value ConvertFIRToLLVMPattern::getRankFromBox(
+    mlir::Location loc, TypePair boxTy, mlir::Value box,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  mlir::Type resultTy = getBoxEleTy(boxTy.llvm, {kRankPosInBox});
+  return getValueFromBox(loc, boxTy, box, resultTy, rewriter, kRankPosInBox);
+}
+
 // Get the element type given an LLVM type that is of the form
 // (array|struct|vector)+ and the provided indexes.
 mlir::Type ConvertFIRToLLVMPattern::getBoxEleTy(
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
index e56595d1c8e23..b48b993ddc5af 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
@@ -348,7 +348,17 @@ class DeclareOpConversion : public mlir::OpRewritePattern<hlfir::DeclareOp> {
       // Helper to generate the hlfir fir.box with the local lower bounds and
       // type parameters.
       auto genHlfirBox = [&]() -> mlir::Value {
-        if (!mlir::isa<fir::BaseBoxType>(firBase.getType())) {
+        if (auto baseBoxType =
+                mlir::dyn_cast<fir::BaseBoxType>(firBase.getType())) {
+          // Rebox so that lower bounds are correct.
+          if (baseBoxType.isAssumedRank())
+            return builder.create<fir::ReboxAssumedRankOp>(
+                loc, hlfirBaseType, firBase,
+                fir::LowerBoundModifierAttribute::SetToOnes);
+          return builder.create<fir::ReboxOp>(loc, hlfirBaseType, firBase,
+                                              declareOp.getShape(),
+                                              /*slice=*/mlir::Value{});
+        } else {
           llvm::SmallVector<mlir::Value> typeParams;
           auto maybeCharType = mlir::dyn_cast<fir::CharacterType>(
               fir::unwrapSequenceType(fir::unwrapPassByRefType(hlfirBaseType)));
@@ -358,11 +368,6 @@ class DeclareOpConversion : public mlir::OpRewritePattern<hlfir::DeclareOp> {
           return builder.create<fir::EmboxOp>(
               loc, hlfirBaseType, firBase, declareOp.getShape(),
               /*slice=*/mlir::Value{}, typeParams);
-        } else {
-          // Rebox so that lower bounds are correct.
-          return builder.create<fir::ReboxOp>(loc, hlfirBaseType, firBase,
-                                              declareOp.getShape(),
-                                              /*slice=*/mlir::Value{});
         }
       };
       if (!mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation())
@@ -789,7 +794,3 @@ class ConvertHLFIRtoFIR
 };
 
 } // namespace
-
-std::unique_ptr<mlir::Pass> hlfir::createConvertHLFIRtoFIRPass() {
-  return std::make_unique<ConvertHLFIRtoFIR>();
-}
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index f34820dd10792..0224ecfdde7c6 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -18,34 +18,34 @@ module cudadevice
   ! Synchronization Functions
 
   interface
-    attributes(device) subroutine syncthreads()
+    attributes(device) subroutine syncthreads() bind(c, name='__syncthreads')
     end subroutine
   end interface
   public :: syncthreads
 
   interface
-    attributes(device) integer function syncthreads_and(value)
+    attributes(device) integer function syncthreads_and(value) bind(c, name='__syncthreads_and')
       integer :: value
     end function
   end interface
   public :: syncthreads_and
 
   interface
-    attributes(device) integer function syncthreads_count(value)
+    attributes(device) integer function syncthreads_count(value) bind(c, name='__syncthreads_count')
       integer :: value
     end function
   end interface
   public :: syncthreads_count
 
   interface
-    attributes(device) integer function syncthreads_or(value)
+    attributes(device) integer function syncthreads_or(value) bind(c, name='__syncthreads_or')
       integer :: value
     end function
   end interface
   public :: syncthreads_or
 
   interface
-    attributes(device) subroutine syncwarp(mask)
+    attributes(device) subroutine syncwarp(mask) bind(c, name='__syncwarp')
       integer :: mask
     end subroutine
   end interface
@@ -54,19 +54,19 @@ attributes(device) subroutine syncwarp(mask)
   ! Memory Fences
 
   interface
-    attributes(device) subroutine threadfence()
+    attributes(device) subroutine threadfence() bind(c, name='__threadfence')
     end subroutine
   end interface
   public :: threadfence
 
   interface
-    attributes(device) subroutine threadfence_block()
+    attributes(device) subroutine threadfence_block() bind(c, name='__threadfence_block')
     end subroutine
   end interface
   public :: threadfence_block
 
   interface
-    attributes(device) subroutine threadfence_system()
+    attributes(device) subroutine threadfence_system() bind(c, name='__threadfence_system')
     end subroutine
   end interface
   public :: threadfence_system
diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir
index 21323a5e657c9..70cb0443e9a64 100644
--- a/flang/test/Fir/convert-to-llvm.fir
+++ b/flang/test/Fir/convert-to-llvm.fir
@@ -941,7 +941,8 @@ func.func @extract_rank(%arg0: !fir.box<!fir.array<*:f64>>) -> i32 {
 // CHECK-LABEL: llvm.func @extract_rank(
 // CHECK-SAME:                          %[[ARG0:.*]]: !llvm.ptr) -> i32
 // CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[ARG0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})>
-// CHECK:         %[[RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i32
+// CHECK:         %[[RAW_RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i8
+// CHECK:         %[[RANK:.*]] = llvm.sext %[[RAW_RANK]] : i8 to i32
 // CHECK:         llvm.return %[[RANK]] : i32
 
 // -----
@@ -1009,9 +1010,9 @@ func.func @box_isarray(%arg0: !fir.box<!fir.array<*:f64>>) -> i1 {
 // CHECK-LABEL: llvm.func @box_isarray(
 // CHECK-SAME:                         %[[ARG0:.*]]: !llvm.ptr) -> i1
 // CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[ARG0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})>
-// CHECK:         %[[RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i32
-// CHECK:         %[[C0_ISARRAY:.*]] = llvm.mlir.constant(0 : i32) : i32
-// CHECK:         %[[IS_ARRAY:.*]] = llvm.icmp "ne" %[[RANK]], %[[C0_ISARRAY]] : i32
+// CHECK:         %[[RANK:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> i8
+// CHECK:         %[[C0_ISARRAY:.*]] = llvm.mlir.constant(0 : i64) : i8
+// CHECK:         %[[IS_ARRAY:.*]] = llvm.icmp "ne" %[[RANK]], %[[C0_ISARRAY]] : i8
 // CHECK:         llvm.return %[[IS_ARRAY]] : i1
 
 // -----
diff --git a/flang/test/Fir/tbaa.fir b/flang/test/Fir/tbaa.fir
index 048f53f5c6e47..f4f23d35cba25 100644
--- a/flang/test/Fir/tbaa.fir
+++ b/flang/test/Fir/tbaa.fir
@@ -248,8 +248,9 @@ func.func @tbaa(%arg0: !fir.box<!fir.array<*:f64>>) -> i32 {
 // CHECK-LABEL:   llvm.func @tbaa(
 // CHECK-SAME:                    %[[VAL_0:.*]]: !llvm.ptr) -> i32 {
 // CHECK:           %[[VAL_1:.*]] = llvm.getelementptr %[[VAL_0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-// CHECK:           %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i32
-// CHECK:           llvm.return %[[VAL_2]] : i32
+// CHECK:           %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i8
+// CHECK:           %[[VAL_3:.*]] = llvm.sext %[[VAL_2]] : i8 to i32
+// CHECK:           llvm.return %[[VAL_3]] : i32
 // CHECK:         }
 
 // -----
@@ -267,9 +268,9 @@ func.func @tbaa(%arg0: !fir.box<!fir.array<*:f64>>) -> i1 {
 // CHECK-LABEL:   llvm.func @tbaa(
 // CHECK-SAME:                    %[[VAL_0:.*]]: !llvm.ptr) -> i1 {
 // CHECK:           %[[VAL_1:.*]] = llvm.getelementptr %[[VAL_0]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
-// CHECK:           %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i32
-// CHECK:           %[[VAL_3:.*]] = llvm.mlir.constant(0 : i32) : i32
-// CHECK:           %[[VAL_4:.*]] = llvm.icmp "ne" %[[VAL_2]], %[[VAL_3]] : i32
+// CHECK:           %[[VAL_2:.*]] = llvm.load %[[VAL_1]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i8
+// CHECK:           %[[VAL_3:.*]] = llvm.mlir.constant(0 : i64) : i8
+// CHECK:           %[[VAL_4:.*]] = llvm.icmp "ne" %[[VAL_2]], %[[VAL_3]] : i8
 // CHECK:           llvm.return %[[VAL_4]] : i1
 // CHECK:         }
 
diff --git a/flang/test/HLFIR/declare-codegen.fir b/flang/test/HLFIR/declare-codegen.fir
index 9f51d0fbc7afd..bd0d61a2559db 100644
--- a/flang/test/HLFIR/declare-codegen.fir
+++ b/flang/test/HLFIR/declare-codegen.fir
@@ -210,3 +210,12 @@ func.func @dummy_scope(%arg0: !fir.ref<f32>) {
 // CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<f32>) {
 // CHECK:         %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
 // CHECK:         %[[VAL_1:.*]] = fir.declare %[[VAL_0]] dummy_scope %[[SCOPE]] {uniq_name = "x"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+
+func.func @assumed_rank_declare(%arg0: !fir.box<!fir.array<*:f32>>) {
+  %0:2 = hlfir.declare %arg0 {uniq_name = "x"} : (!fir.box<!fir.array<*:f32>>) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+  return
+}
+// CHECK-LABEL:  func.func @assumed_rank_declare(
+// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>>) {
+// CHECK:    %[[VAL_1:.*]] = fir.declare %[[VAL_0]] {uniq_name = "x"} : (!fir.box<!fir.array<*:f32>>) -> !fir.box<!fir.array<*:f32>>
+// CHECK:    %[[VAL_2:.*]] = fir.rebox_assumed_rank %[[VAL_1]] lbs ones : (!fir.box<!fir.array<*:f32>>) -> !fir.box<!fir.array<*:f32>>
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
new file mode 100644
index 0000000000000..0c71ea6efcd63
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -0,0 +1,36 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test CUDA Fortran procedures available in cudadevice module
+
+attributes(global) subroutine devsub()
+  implicit none
+  integer :: ret
+
+  call syncthreads()
+  call syncwarp(1)
+  call threadfence()
+  call threadfence_block()
+  call threadfence_system()
+  ret = syncthreads_and(1)
+  ret = syncthreads_count(1)
+  ret = syncthreads_or(1)
+end
+
+! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: fir.call @__syncthreads()
+! CHECK: fir.call @__syncwarp(%{{.*}}) fastmath<contract> : (!fir.ref<i32>) -> ()
+! CHECK: fir.call @__threadfence()
+! CHECK: fir.call @__threadfence_block()
+! CHECK: fir.call @__threadfence_system()
+! CHECK: %{{.*}} = fir.call @__syncthreads_and(%{{.*}}) fastmath<contract> : (!fir.ref<i32>) -> i32
+! CHECK: %{{.*}} = fir.call @__syncthreads_count(%{{.*}}) fastmath<contract> : (!fir.ref<i32>) -> i32
+! CHECK: %{{.*}} = fir.call @__syncthreads_or(%{{.*}}) fastmath<contract> : (!fir.ref<i32>) -> i32
+
+! CHECK: func.func private @__syncthreads() attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncthreads"}
+! CHECK: func.func private @__syncwarp(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncwarp"}
+! CHECK: func.func private @__threadfence() attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__threadfence"}
+! CHECK: func.func private @__threadfence_block() attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__threadfence_block"}
+! CHECK: func.func private @__threadfence_system() attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__threadfence_system"}
+! CHECK: func.func private @__syncthreads_and(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncthreads_and"}
+! CHECK: func.func private @__syncthreads_count(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncthreads_count"}
+! CHECK: func.func private @__syncthreads_or(!fir.ref<i32> {cuf.data_attr = #cuf.cuda<device>}) -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<device>, fir.bindc_name = "__syncthreads_or"}
diff --git a/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90 b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90
new file mode 100644
index 0000000000000..748c15be84496
--- /dev/null
+++ b/flang/test/Lower/HLFIR/convert-variable-assumed-rank.f90
@@ -0,0 +1,70 @@
+! Test lowering of assumed-rank variables
+! RUN: bbc -emit-hlfir %s -allow-assumed-rank -o - | FileCheck %s
+
+module assumed_rank_tests
+interface  ! explicit interfaces for the two external callees used by the tests in this module
+subroutine takes_real(x)
+  real :: x(..)  ! assumed-rank real dummy
+end subroutine
+subroutine takes_char(x)
+  character(*) :: x(..)  ! assumed-rank, assumed-length character dummy
+end subroutine
+end interface
+contains
+
+subroutine test_intrinsic(x)  ! baseline case: real assumed-rank dummy with no attributes
+  real :: x(..)
+  call takes_real(x)  ! x is forwarded unchanged; the expected IR passes the declared box directly
+end subroutine
+
+subroutine test_character_explicit_len(x, n)  ! character assumed-rank dummy whose length is the dummy n
+  integer(8) :: n
+  character(n) :: x(..)  ! expected IR loads n and selects max(n, 0) as the length type parameter
+  call takes_char(x)
+end subroutine
+
+subroutine test_character_assumed_len(x)  ! assumed length: the expected hlfir.declare carries no typeparams
+  character(*) :: x(..)
+  call takes_char(x)
+end subroutine
+
+subroutine test_with_attrs(x)  ! checks that dummy attributes are carried through lowering
+  real, target, optional :: x(..)  ! expected as fir.optional/fir.target on the argument and var_attrs<optional, target> on the declare
+  call takes_real(x)
+end subroutine
+! CHECK-LABEL:   func.func @_QMassumed_rank_testsPtest_intrinsic(
+! CHECK-SAME:                                                    %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QMassumed_rank_testsFtest_intrinsicEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK:           fir.call @_QPtakes_real(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
+! CHECK:           return
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QMassumed_rank_testsPtest_character_explicit_len(
+! CHECK-SAME:                                                                 %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"},
+! CHECK-SAME:                                                                 %[[VAL_1:.*]]: !fir.ref<i64> {fir.bindc_name = "n"}) {
+! CHECK:           %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEn"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<i64>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i64
+! CHECK:           %[[VAL_6:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_5]] : i64
+! CHECK:           %[[VAL_7:.*]] = arith.select %[[VAL_6]], %[[VAL_4]], %[[VAL_5]] : i64
+! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_7]] dummy_scope %[[VAL_2]] {uniq_name = "_QMassumed_rank_testsFtest_character_explicit_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, i64, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
+! CHECK:           fir.call @_QPtakes_char(%[[VAL_8]]#0) fastmath<contract> : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> ()
+! CHECK:           return
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QMassumed_rank_testsPtest_character_assumed_len(
+! CHECK-SAME:                                                                %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QMassumed_rank_testsFtest_character_assumed_lenEx"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>)
+! CHECK:           fir.call @_QPtakes_char(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> ()
+! CHECK:           return
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QMassumed_rank_testsPtest_with_attrs(
+! CHECK-SAME:                                                     %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.optional, fir.target}) {
+! CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional, target>, uniq_name = "_QMassumed_rank_testsFtest_with_attrsEx"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>)
+! CHECK:           fir.call @_QPtakes_real(%[[VAL_2]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
+end module
diff --git a/flang/test/Lower/OpenMP/allocatable-multiple-vars.f90 b/flang/test/Lower/OpenMP/allocatable-multiple-vars.f90
new file mode 100644
index 0000000000000..e6450a13e13a0
--- /dev/null
+++ b/flang/test/Lower/OpenMP/allocatable-multiple-vars.f90
@@ -0,0 +1,28 @@
+! Test early privatization for multiple allocatable variables.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization=false \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization=false -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+subroutine delayed_privatization_allocatable  ! NOTE(review): name says "delayed", but both invocations at the top of this file pass --openmp-enable-delayed-privatization=false
+  implicit none
+  integer, allocatable :: var1, var2  ! two allocatables, so one allocmem/freemem pair is expected per variable
+
+!$omp parallel private(var1, var2)
+  var1 = 10  ! assignments inside the region target the private copies
+  var2 = 20
+!$omp end parallel
+end subroutine
+
+! Verify that private versions of each variable are both allocated and freed
+! within the parallel region.
+
+! CHECK:      omp.parallel {
+! CHECK:        fir.allocmem
+! CHECK:        fir.allocmem
+! CHECK:        fir.freemem
+! CHECK:        fir.freemem
+! CHECK:        omp.terminator
+! CHECK-NEXT: }
diff --git a/flang/test/Lower/PowerPC/ppc-vec-load.f90 b/flang/test/Lower/PowerPC/ppc-vec-load.f90
index 4d51512df0f7b..a81ed055ce08c 100644
--- a/flang/test/Lower/PowerPC/ppc-vec-load.f90
+++ b/flang/test/Lower/PowerPC/ppc-vec-load.f90
@@ -1,12 +1,13 @@
-! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s
-! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s
+! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE","LLVM" %s
+! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -target-cpu pwr9 -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR_P9","LLVM" %s
+! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE","LLVM" %s
 ! REQUIRES: target=powerpc{{.*}}
 
 !----------------------
 ! vec_ld
 !----------------------
 
-! CHECK-LABEL: @vec_ld_testi8
+! LLVM-LABEL: @vec_ld_testi8
 subroutine vec_ld_testi8(arg1, arg2, res)
   integer(1) :: arg1
   vector(integer(1)) :: arg2, res
@@ -19,7 +20,7 @@ subroutine vec_ld_testi8(arg1, arg2, res)
 ! LLVMIR: store <16 x i8> %[[bc]], ptr %2, align 16
 end subroutine vec_ld_testi8
 
-! CHECK-LABEL: @vec_ld_testi16
+! LLVM-LABEL: @vec_ld_testi16
 subroutine vec_ld_testi16(arg1, arg2, res)
   integer(2) :: arg1
   vector(integer(2)) :: arg2, res
@@ -32,7 +33,7 @@ subroutine vec_ld_testi16(arg1, arg2, res)
 ! LLVMIR: store <8 x i16> %[[bc]], ptr %2, align 16
 end subroutine vec_ld_testi16
 
-! CHECK-LABEL: @vec_ld_testi32
+! LLVM-LABEL: @vec_ld_testi32
 subroutine vec_ld_testi32(arg1, arg2, res)
   integer(4) :: arg1
   vector(integer(4)) :: arg2, res
@@ -44,7 +45,7 @@ subroutine vec_ld_testi32(arg1, arg2, res)
 ! LLVMIR: store <4 x i32> %[[bc]], ptr %2, align 16
 end subroutine vec_ld_testi32
 
-! CHECK-LABEL: @vec_ld_testf32
+! LLVM-LABEL: @vec_ld_testf32
 subroutine vec_ld_testf32(arg1, arg2, res)
   integer(8) :: arg1
   vector(real(4)) :: arg2, res
@@ -58,7 +59,7 @@ subroutine vec_ld_testf32(arg1, arg2, res)
 ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16
 end subroutine vec_ld_testf32
 
-! CHECK-LABEL: @vec_ld_testu32
+! LLVM-LABEL: @vec_ld_testu32
 subroutine vec_ld_testu32(arg1, arg2, res)
   integer(1) :: arg1
   vector(unsigned(4)) :: arg2, res
@@ -70,7 +71,7 @@ subroutine vec_ld_testu32(arg1, arg2, res)
 ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16
 end subroutine vec_ld_testu32
 
-! CHECK-LABEL: @vec_ld_testi32a
+! LLVM-LABEL: @vec_ld_testi32a
 subroutine vec_ld_testi32a(arg1, arg2, res)
   integer(4) :: arg1
   integer(4) :: arg2(10)
@@ -83,7 +84,7 @@ subroutine vec_ld_testi32a(arg1, arg2, res)
 ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16
 end subroutine vec_ld_testi32a
 
-! CHECK-LABEL: @vec_ld_testf32av
+! LLVM-LABEL: @vec_ld_testf32av
 subroutine vec_ld_testf32av(arg1, arg2, res)
   integer(8) :: arg1
   vector(real(4)) :: arg2(2, 4, 8)
@@ -98,7 +99,7 @@ subroutine vec_ld_testf32av(arg1, arg2, res)
 ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16
 end subroutine vec_ld_testf32av
 
-! CHECK-LABEL: @vec_ld_testi32s
+! LLVM-LABEL: @vec_ld_testi32s
 subroutine vec_ld_testi32s(arg1, arg2, res)
   integer(4) :: arg1
   real(4) :: arg2
@@ -116,7 +117,7 @@ end subroutine vec_ld_testi32s
 ! vec_lde
 !----------------------
 
-! CHECK-LABEL: @vec_lde_testi8s
+! LLVM-LABEL: @vec_lde_testi8s
 subroutine vec_lde_testi8s(arg1, arg2, res)
   integer(1) :: arg1
   integer(1) :: arg2
@@ -129,7 +130,7 @@ subroutine vec_lde_testi8s(arg1, arg2, res)
 ! LLVMIR: store <16 x i8> %[[call]], ptr %2, align 16
 end subroutine vec_lde_testi8s
 
-! CHECK-LABEL: @vec_lde_testi16a
+! LLVM-LABEL: @vec_lde_testi16a
 subroutine vec_lde_testi16a(arg1, arg2, res)
   integer(2) :: arg1
   integer(2) :: arg2(2, 4, 8)
@@ -142,7 +143,7 @@ subroutine vec_lde_testi16a(arg1, arg2, res)
 ! LLVMIR: store <8 x i16> %[[call]], ptr %2, align 16
 end subroutine vec_lde_testi16a
 
-! CHECK-LABEL: @vec_lde_testi32a
+! LLVM-LABEL: @vec_lde_testi32a
 subroutine vec_lde_testi32a(arg1, arg2, res)
   integer(4) :: arg1
   integer(4) :: arg2(4)
@@ -155,7 +156,7 @@ subroutine vec_lde_testi32a(arg1, arg2, res)
 ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16
 end subroutine vec_lde_testi32a
 
-! CHECK-LABEL: @vec_lde_testf32a
+! LLVM-LABEL: @vec_lde_testf32a
 subroutine vec_lde_testf32a(arg1, arg2, res)
   integer(8) :: arg1
   real(4) :: arg2(4)
@@ -173,7 +174,7 @@ end subroutine vec_lde_testf32a
 ! vec_ldl
 !----------------------
 
-! CHECK-LABEL: @vec_ldl_testi8
+! LLVM-LABEL: @vec_ldl_testi8
 subroutine vec_ldl_testi8(arg1, arg2, res)
   integer(1) :: arg1
   vector(integer(1)) :: arg2, res
@@ -186,7 +187,7 @@ subroutine vec_ldl_testi8(arg1, arg2, res)
 ! LLVMIR: store <16 x i8> %[[bc]], ptr %2, align 16
 end subroutine vec_ldl_testi8
 
-! CHECK-LABEL: @vec_ldl_testi16
+! LLVM-LABEL: @vec_ldl_testi16
 subroutine vec_ldl_testi16(arg1, arg2, res)
   integer(2) :: arg1
   vector(integer(2)) :: arg2, res
@@ -199,7 +200,7 @@ subroutine vec_ldl_testi16(arg1, arg2, res)
 ! LLVMIR: store <8 x i16> %[[bc]], ptr %2, align 16
 end subroutine vec_ldl_testi16
 
-! CHECK-LABEL: @vec_ldl_testi32
+! LLVM-LABEL: @vec_ldl_testi32
 subroutine vec_ldl_testi32(arg1, arg2, res)
   integer(4) :: arg1
   vector(integer(4)) :: arg2, res
@@ -211,7 +212,7 @@ subroutine vec_ldl_testi32(arg1, arg2, res)
 ! LLVMIR: store <4 x i32> %[[bc]], ptr %2, align 16
 end subroutine vec_ldl_testi32
 
-! CHECK-LABEL: @vec_ldl_testf32
+! LLVM-LABEL: @vec_ldl_testf32
 subroutine vec_ldl_testf32(arg1, arg2, res)
   integer(8) :: arg1
   vector(real(4)) :: arg2, res
@@ -225,7 +226,7 @@ subroutine vec_ldl_testf32(arg1, arg2, res)
 ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16
 end subroutine vec_ldl_testf32
 
-! CHECK-LABEL: @vec_ldl_testu32
+! LLVM-LABEL: @vec_ldl_testu32
 subroutine vec_ldl_testu32(arg1, arg2, res)
   integer(1) :: arg1
   vector(unsigned(4)) :: arg2, res
@@ -237,7 +238,7 @@ subroutine vec_ldl_testu32(arg1, arg2, res)
 ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16
 end subroutine vec_ldl_testu32
 
-! CHECK-LABEL: @vec_ldl_testi32a
+! LLVM-LABEL: @vec_ldl_testi32a
 subroutine vec_ldl_testi32a(arg1, arg2, res)
   integer(4) :: arg1
   integer(4) :: arg2(10)
@@ -250,7 +251,7 @@ subroutine vec_ldl_testi32a(arg1, arg2, res)
 ! LLVMIR: store <4 x i32> %[[call]], ptr %2, align 16
 end subroutine vec_ldl_testi32a
 
-! CHECK-LABEL: @vec_ldl_testf32av
+! LLVM-LABEL: @vec_ldl_testf32av
 subroutine vec_ldl_testf32av(arg1, arg2, res)
   integer(8) :: arg1
   vector(real(4)) :: arg2(2, 4, 8)
@@ -264,7 +265,7 @@ subroutine vec_ldl_testf32av(arg1, arg2, res)
 ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16
 end subroutine vec_ldl_testf32av
 
-! CHECK-LABEL: @vec_ldl_testi32s
+! LLVM-LABEL: @vec_ldl_testi32s
 subroutine vec_ldl_testi32s(arg1, arg2, res)
   integer(4) :: arg1
   real(4) :: arg2
@@ -282,7 +283,7 @@ end subroutine vec_ldl_testi32s
 ! vec_lvsl
 !----------------------
 
-! CHECK-LABEL: @vec_lvsl_testi8s
+! LLVM-LABEL: @vec_lvsl_testi8s
 subroutine vec_lvsl_testi8s(arg1, arg2, res)
   integer(1) :: arg1
   integer(1) :: arg2
@@ -300,7 +301,7 @@ subroutine vec_lvsl_testi8s(arg1, arg2, res)
 ! LLVMIR-BE: store <16 x i8> %[[ld]], ptr %2, align 16
 end subroutine vec_lvsl_testi8s
 
-! CHECK-LABEL: @vec_lvsl_testi16a
+! LLVM-LABEL: @vec_lvsl_testi16a
 subroutine vec_lvsl_testi16a(arg1, arg2, res)
   integer(2) :: arg1
   integer(2) :: arg2(4)
@@ -318,7 +319,7 @@ subroutine vec_lvsl_testi16a(arg1, arg2, res)
 ! LLVMIR-BE:  store <16 x i8> %[[ld]], ptr %2, align 16
 end subroutine vec_lvsl_testi16a
 
-! CHECK-LABEL: @vec_lvsl_testi32a
+! LLVM-LABEL: @vec_lvsl_testi32a
 subroutine vec_lvsl_testi32a(arg1, arg2, res)
   integer(4) :: arg1
   integer(4) :: arg2(2, 3, 4)
@@ -336,7 +337,7 @@ subroutine vec_lvsl_testi32a(arg1, arg2, res)
 ! LLVMIR-BE:  store <16 x i8> %[[ld]], ptr %2, align 16
 end subroutine vec_lvsl_testi32a
 
-! CHECK-LABEL: @vec_lvsl_testf32a
+! LLVM-LABEL: @vec_lvsl_testf32a
 subroutine vec_lvsl_testf32a(arg1, arg2, res)
   integer(8) :: arg1
   real(4) :: arg2(4)
@@ -357,7 +358,7 @@ end subroutine vec_lvsl_testf32a
 ! vec_lvsr
 !----------------------
 
-! CHECK-LABEL: @vec_lvsr_testi8s
+! LLVM-LABEL: @vec_lvsr_testi8s
 subroutine vec_lvsr_testi8s(arg1, arg2, res)
   integer(1) :: arg1
   integer(1) :: arg2
@@ -375,7 +376,7 @@ subroutine vec_lvsr_testi8s(arg1, arg2, res)
 ! LLVMIR-BE: store <16 x i8> %[[addr]], ptr %2, align 16
 end subroutine vec_lvsr_testi8s
 
-! CHECK-LABEL: @vec_lvsr_testi16a
+! LLVM-LABEL: @vec_lvsr_testi16a
 subroutine vec_lvsr_testi16a(arg1, arg2, res)
   integer(2) :: arg1
   integer(2) :: arg2(4)
@@ -393,7 +394,7 @@ subroutine vec_lvsr_testi16a(arg1, arg2, res)
 ! LLVMIR-BE: store <16 x i8> %[[addr]], ptr %2, align 16
 end subroutine vec_lvsr_testi16a
 
-! CHECK-LABEL: @vec_lvsr_testi32a
+! LLVM-LABEL: @vec_lvsr_testi32a
 subroutine vec_lvsr_testi32a(arg1, arg2, res)
   integer(4) :: arg1
   integer(4) :: arg2(2, 3, 4)
@@ -411,7 +412,7 @@ subroutine vec_lvsr_testi32a(arg1, arg2, res)
 ! LLVMIR-BE: store <16 x i8> %[[addr]], ptr %2, align 16
 end subroutine vec_lvsr_testi32a
 
-! CHECK-LABEL: @vec_lvsr_testf32a
+! LLVM-LABEL: @vec_lvsr_testf32a
 subroutine vec_lvsr_testf32a(arg1, arg2, res)
   integer(8) :: arg1
   real(4) :: arg2(4)
@@ -432,7 +433,7 @@ end subroutine vec_lvsr_testf32a
 ! vec_lxv
 !----------------------
 
-! CHECK-LABEL: @vec_lxv_testi8a
+! LLVM-LABEL: @vec_lxv_testi8a
 subroutine vec_lxv_testi8a(arg1, arg2, res)
   integer(1) :: arg1
   integer(1) :: arg2(4)
@@ -445,7 +446,7 @@ subroutine vec_lxv_testi8a(arg1, arg2, res)
 ! LLVMIR_P9: store <16 x i8> %[[ld]], ptr %2, align 16
 end subroutine vec_lxv_testi8a
 
-! CHECK-LABEL: @vec_lxv_testi16a
+! LLVM-LABEL: @vec_lxv_testi16a
 subroutine vec_lxv_testi16a(arg1, arg2, res)
   integer(2) :: arg1
   integer(2) :: arg2(2, 4, 8)
@@ -458,7 +459,7 @@ subroutine vec_lxv_testi16a(arg1, arg2, res)
 ! LLVMIR_P9: store <8 x i16> %[[ld]], ptr %2, align 16
 end subroutine vec_lxv_testi16a
 
-! CHECK-LABEL: @vec_lxv_testi32a
+! LLVM-LABEL: @vec_lxv_testi32a
 subroutine vec_lxv_testi32a(arg1, arg2, res)
   integer(4) :: arg1
   integer(4) :: arg2(2, 4, 8)
@@ -471,7 +472,7 @@ subroutine vec_lxv_testi32a(arg1, arg2, res)
 ! LLVMIR_P9: store <4 x i32> %[[ld]], ptr %2, align 16
 end subroutine vec_lxv_testi32a
 
-! CHECK-LABEL: @vec_lxv_testf32a
+! LLVM-LABEL: @vec_lxv_testf32a
 subroutine vec_lxv_testf32a(arg1, arg2, res)
   integer(2) :: arg1
   real(4) :: arg2(4)
@@ -484,7 +485,7 @@ subroutine vec_lxv_testf32a(arg1, arg2, res)
 ! LLVMIR_P9: store <4 x float> %[[ld]], ptr %2, align 16
 end subroutine vec_lxv_testf32a
 
-! CHECK-LABEL: @vec_lxv_testf64a
+! LLVM-LABEL: @vec_lxv_testf64a
 subroutine vec_lxv_testf64a(arg1, arg2, res)
   integer(8) :: arg1
   real(8) :: arg2(4)
@@ -501,7 +502,7 @@ end subroutine vec_lxv_testf64a
 ! vec_xld2
 !----------------------
 
-! CHECK-LABEL: @vec_xld2_testi8a
+! LLVM-LABEL: @vec_xld2_testi8a
 subroutine vec_xld2_testi8a(arg1, arg2, res)
   integer(1) :: arg1
   vector(integer(1)) :: arg2(4)
@@ -515,7 +516,7 @@ subroutine vec_xld2_testi8a(arg1, arg2, res)
 ! LLVMIR: store <16 x i8> %[[bc]], ptr %2, align 16
 end subroutine vec_xld2_testi8a
 
-! CHECK-LABEL: @vec_xld2_testi16
+! LLVM-LABEL: @vec_xld2_testi16
 subroutine vec_xld2_testi16(arg1, arg2, res)
   integer :: arg1
   vector(integer(2)) :: arg2
@@ -529,7 +530,7 @@ subroutine vec_xld2_testi16(arg1, arg2, res)
 ! LLVMIR: store <8 x i16> %[[bc]], ptr %2, align 16
 end subroutine vec_xld2_testi16
 
-! CHECK-LABEL: @vec_xld2_testi32a
+! LLVM-LABEL: @vec_xld2_testi32a
 subroutine vec_xld2_testi32a(arg1, arg2, res)
   integer(4) :: arg1
   vector(integer(4)) :: arg2(41)
@@ -543,7 +544,7 @@ subroutine vec_xld2_testi32a(arg1, arg2, res)
 ! LLVMIR: store <4 x i32> %[[bc]], ptr %2, align 16
 end subroutine vec_xld2_testi32a
 
-! CHECK-LABEL: @vec_xld2_testi64a
+! LLVM-LABEL: @vec_xld2_testi64a
 subroutine vec_xld2_testi64a(arg1, arg2, res)
   integer(8) :: arg1
   vector(integer(8)) :: arg2(4)
@@ -557,7 +558,7 @@ subroutine vec_xld2_testi64a(arg1, arg2, res)
 ! LLVMIR: store <2 x i64> %[[bc]], ptr %2, align 16
 end subroutine vec_xld2_testi64a
 
-! CHECK-LABEL: @vec_xld2_testf32a
+! LLVM-LABEL: @vec_xld2_testf32a
 subroutine vec_xld2_testf32a(arg1, arg2, res)
   integer(2) :: arg1
   vector(real(4)) :: arg2(4)
@@ -571,7 +572,7 @@ subroutine vec_xld2_testf32a(arg1, arg2, res)
 ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16
 end subroutine vec_xld2_testf32a
 
-! CHECK-LABEL: @vec_xld2_testf64a
+! LLVM-LABEL: @vec_xld2_testf64a
 subroutine vec_xld2_testf64a(arg1, arg2, res)
   integer(8) :: arg1
   vector(real(8)) :: arg2(4)
@@ -588,7 +589,7 @@ end subroutine vec_xld2_testf64a
 ! vec_xl
 !----------------------
 
-! CHECK-LABEL: @vec_xl_testi8a
+! LLVM-LABEL: @vec_xl_testi8a
 subroutine vec_xl_testi8a(arg1, arg2, res)
   integer(1) :: arg1
   integer(1) :: arg2(4)
@@ -601,7 +602,7 @@ subroutine vec_xl_testi8a(arg1, arg2, res)
 ! LLVMIR: store <16 x i8> %[[ld]], ptr %2, align 16
 end subroutine vec_xl_testi8a
 
-! CHECK-LABEL: @vec_xl_testi16a
+! LLVM-LABEL: @vec_xl_testi16a
 subroutine vec_xl_testi16a(arg1, arg2, res)
   integer(2) :: arg1
   integer(2) :: arg2(2, 4, 8)
@@ -614,7 +615,7 @@ subroutine vec_xl_testi16a(arg1, arg2, res)
 ! LLVMIR: store <8 x i16> %[[ld]], ptr %2, align 16
 end subroutine vec_xl_testi16a
 
-! CHECK-LABEL: @vec_xl_testi32a
+! LLVM-LABEL: @vec_xl_testi32a
 subroutine vec_xl_testi32a(arg1, arg2, res)
   integer(4) :: arg1
   integer(4) :: arg2(2, 4, 8)
@@ -627,7 +628,7 @@ subroutine vec_xl_testi32a(arg1, arg2, res)
 ! LLVMIR: store <4 x i32> %[[ld]], ptr %2, align 16
 end subroutine vec_xl_testi32a
 
-! CHECK-LABEL: @vec_xl_testi64a
+! LLVM-LABEL: @vec_xl_testi64a
 subroutine vec_xl_testi64a(arg1, arg2, res)
   integer(8) :: arg1
   integer(8) :: arg2(2, 4, 8)
@@ -641,7 +642,7 @@ subroutine vec_xl_testi64a(arg1, arg2, res)
 ! LLVMIR: store <2 x i64> %[[bc]], ptr %2, align 16
 end subroutine vec_xl_testi64a
 
-! CHECK-LABEL: @vec_xl_testf32a
+! LLVM-LABEL: @vec_xl_testf32a
 subroutine vec_xl_testf32a(arg1, arg2, res)
   integer(2) :: arg1
   real(4) :: arg2(4)
@@ -655,7 +656,7 @@ subroutine vec_xl_testf32a(arg1, arg2, res)
 ! LLVMIR: store <4 x float> %[[bc]], ptr %2, align 16
 end subroutine vec_xl_testf32a
 
-! CHECK-LABEL: @vec_xl_testf64a
+! LLVM-LABEL: @vec_xl_testf64a
 subroutine vec_xl_testf64a(arg1, arg2, res)
   integer(8) :: arg1
   real(8) :: arg2
@@ -672,7 +673,7 @@ end subroutine vec_xl_testf64a
 ! vec_xlds
 !----------------------
 
-! CHECK-LABEL: @vec_xlds_testi64a
+! LLVM-LABEL: @vec_xlds_testi64a
 subroutine vec_xlds_testi64a(arg1, arg2, res)
   integer(8) :: arg1
   vector(integer(8)) :: arg2(4)
@@ -687,7 +688,7 @@ subroutine vec_xlds_testi64a(arg1, arg2, res)
 ! LLVMIR: store <2 x i64> %[[shfl]], ptr %2, align 16
 end subroutine vec_xlds_testi64a
 
-! CHECK-LABEL: @vec_xlds_testf64a
+! LLVM-LABEL: @vec_xlds_testf64a
 subroutine vec_xlds_testf64a(arg1, arg2, res)
   integer(8) :: arg1
   vector(real(8)) :: arg2(4)
@@ -707,7 +708,7 @@ end subroutine vec_xlds_testf64a
 ! vec_xl_be
 !----------------------
 
-! CHECK-LABEL: @vec_xl_be_testi8a
+! LLVM-LABEL: @vec_xl_be_testi8a
 subroutine vec_xl_be_testi8a(arg1, arg2, res)
   integer(1) :: arg1
   integer(1) :: arg2(2, 4, 8)
@@ -722,7 +723,7 @@ subroutine vec_xl_be_testi8a(arg1, arg2, res)
 ! LLVMIR-BE: store <16 x i8> %[[ld]], ptr %2, align 16
 end subroutine vec_xl_be_testi8a
 
-! CHECK-LABEL: @vec_xl_be_testi16a
+! LLVM-LABEL: @vec_xl_be_testi16a
 subroutine vec_xl_be_testi16a(arg1, arg2, res)
   integer(2) :: arg1
   integer(2) :: arg2(2, 4, 8)
@@ -737,7 +738,7 @@ subroutine vec_xl_be_testi16a(arg1, arg2, res)
 ! LLVMIR-BE: store <8 x i16> %[[ld]], ptr %2, align 16
 end subroutine vec_xl_be_testi16a
 
-! CHECK-LABEL: @vec_xl_be_testi32a
+! LLVM-LABEL: @vec_xl_be_testi32a
 subroutine vec_xl_be_testi32a(arg1, arg2, res)
   integer(4) :: arg1
   integer(4) :: arg2(2, 4, 8)
@@ -752,7 +753,7 @@ subroutine vec_xl_be_testi32a(arg1, arg2, res)
 ! LLVMIR-BE:  store <4 x i32> %[[ld]], ptr %2, align 16
 end subroutine vec_xl_be_testi32a
 
-! CHECK-LABEL: @vec_xl_be_testi64a
+! LLVM-LABEL: @vec_xl_be_testi64a
 subroutine vec_xl_be_testi64a(arg1, arg2, res)
   integer(8) :: arg1
   integer(8) :: arg2(2, 4, 8)
@@ -767,7 +768,7 @@ subroutine vec_xl_be_testi64a(arg1, arg2, res)
 ! LLVMIR-BE:  store <2 x i64> %[[ld]], ptr %2, align 16
 end subroutine vec_xl_be_testi64a
 
-! CHECK-LABEL: @vec_xl_be_testf32a
+! LLVM-LABEL: @vec_xl_be_testf32a
 subroutine vec_xl_be_testf32a(arg1, arg2, res)
   integer(2) :: arg1
   real(4) :: arg2(4)
@@ -782,7 +783,7 @@ subroutine vec_xl_be_testf32a(arg1, arg2, res)
 ! LLVMIR-BE:  store <4 x float> %[[ld]], ptr %2, align 16
 end subroutine vec_xl_be_testf32a
 
-! CHECK-LABEL: @vec_xl_be_testf64a
+! LLVM-LABEL: @vec_xl_be_testf64a
 subroutine vec_xl_be_testf64a(arg1, arg2, res)
   integer(8) :: arg1
   real(8) :: arg2(7)
@@ -801,7 +802,7 @@ end subroutine vec_xl_be_testf64a
 ! vec_xlw4
 !----------------------
 
-! CHECK-LABEL: @vec_xlw4_testi8a
+! LLVM-LABEL: @vec_xlw4_testi8a
 subroutine vec_xlw4_testi8a(arg1, arg2, res)
   integer(1) :: arg1
   vector(integer(1)) :: arg2(2, 4, 8)
@@ -815,7 +816,7 @@ subroutine vec_xlw4_testi8a(arg1, arg2, res)
 ! LLVMIR: store <16 x i8> %[[res]], ptr %2, align 16
 end subroutine vec_xlw4_testi8a
 
-! CHECK-LABEL: @vec_xlw4_testi16a
+! LLVM-LABEL: @vec_xlw4_testi16a
 subroutine vec_xlw4_testi16a(arg1, arg2, res)
   integer(2) :: arg1
   vector(integer(2)) :: arg2(2, 4, 8)
@@ -829,7 +830,7 @@ subroutine vec_xlw4_testi16a(arg1, arg2, res)
 ! LLVMIR: store <8 x i16> %[[res]], ptr %2, align 16
 end subroutine vec_xlw4_testi16a
 
-! CHECK-LABEL: @vec_xlw4_testu32a
+! LLVM-LABEL: @vec_xlw4_testu32a
 subroutine vec_xlw4_testu32a(arg1, arg2, res)
   integer(4) :: arg1
   vector(unsigned(4)) :: arg2(2, 4, 8)
@@ -842,7 +843,7 @@ subroutine vec_xlw4_testu32a(arg1, arg2, res)
 ! LLVMIR: store <4 x i32> %[[ld]], ptr %2, align 16
 end subroutine vec_xlw4_testu32a
 
-! CHECK-LABEL: @vec_xlw4_testf32a
+! LLVM-LABEL: @vec_xlw4_testf32a
 subroutine vec_xlw4_testf32a(arg1, arg2, res)
   integer(2) :: arg1
   vector(real(4)) :: arg2(4)
diff --git a/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90 b/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90
index bd83f28b4eeb5..6c4f202f89a45 100644
--- a/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90
+++ b/flang/test/Lower/PowerPC/ppc-vec-shift-be-le.f90
@@ -1,13 +1,13 @@
-! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="CHECK" %s
+! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR","LLVM" %s
 !
-! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64-unknown-aix -o - | FileCheck --check-prefixes="BE-LLVMIR" %s
+! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -triple ppc64-unknown-aix -o - | FileCheck --check-prefixes="BE-LLVMIR","LLVM" %s
 ! REQUIRES: target=powerpc{{.*}}
 
 !----------------------
 ! vec_sld
 !----------------------
 
-! CHECK-LABEL: vec_sld_test_i1i1
+! LLVM-LABEL: vec_sld_test_i1i1
 subroutine vec_sld_test_i1i1(arg1, arg2)
   vector(integer(1)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_1)
@@ -23,7 +23,7 @@ subroutine vec_sld_test_i1i1(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i1i1
 
-! CHECK-LABEL: vec_sld_test_i1i2
+! LLVM-LABEL: vec_sld_test_i1i2
 subroutine vec_sld_test_i1i2(arg1, arg2)
   vector(integer(1)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_2)
@@ -39,7 +39,7 @@ subroutine vec_sld_test_i1i2(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i1i2
 
-! CHECK-LABEL: vec_sld_test_i1i4
+! LLVM-LABEL: vec_sld_test_i1i4
 subroutine vec_sld_test_i1i4(arg1, arg2)
   vector(integer(1)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_4)
@@ -55,7 +55,7 @@ subroutine vec_sld_test_i1i4(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i1i4
 
-! CHECK-LABEL: vec_sld_test_i1i8
+! LLVM-LABEL: vec_sld_test_i1i8
 subroutine vec_sld_test_i1i8(arg1, arg2)
   vector(integer(1)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_8)
@@ -71,7 +71,7 @@ subroutine vec_sld_test_i1i8(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i1i8
 
-! CHECK-LABEL: vec_sld_test_i2i1
+! LLVM-LABEL: vec_sld_test_i2i1
 subroutine vec_sld_test_i2i1(arg1, arg2)
   vector(integer(2)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_1)
@@ -93,7 +93,7 @@ subroutine vec_sld_test_i2i1(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i2i1
 
-! CHECK-LABEL: vec_sld_test_i2i2
+! LLVM-LABEL: vec_sld_test_i2i2
 subroutine vec_sld_test_i2i2(arg1, arg2)
   vector(integer(2)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 8_2)
@@ -115,7 +115,7 @@ subroutine vec_sld_test_i2i2(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i2i2
 
-! CHECK-LABEL: vec_sld_test_i2i4
+! LLVM-LABEL: vec_sld_test_i2i4
 subroutine vec_sld_test_i2i4(arg1, arg2)
   vector(integer(2)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_4)
@@ -137,7 +137,7 @@ subroutine vec_sld_test_i2i4(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i2i4
 
-! CHECK-LABEL: vec_sld_test_i2i8
+! LLVM-LABEL: vec_sld_test_i2i8
 subroutine vec_sld_test_i2i8(arg1, arg2)
   vector(integer(2)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 11_8)
@@ -159,7 +159,7 @@ subroutine vec_sld_test_i2i8(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i2i8
 
-! CHECK-LABEL: vec_sld_test_i4i1
+! LLVM-LABEL: vec_sld_test_i4i1
 subroutine vec_sld_test_i4i1(arg1, arg2)
   vector(integer(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_1)
@@ -181,7 +181,7 @@ subroutine vec_sld_test_i4i1(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i4i1
 
-! CHECK-LABEL: vec_sld_test_i4i2
+! LLVM-LABEL: vec_sld_test_i4i2
 subroutine vec_sld_test_i4i2(arg1, arg2)
   vector(integer(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_2)
@@ -203,7 +203,7 @@ subroutine vec_sld_test_i4i2(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i4i2
 
-! CHECK-LABEL: vec_sld_test_i4i4
+! LLVM-LABEL: vec_sld_test_i4i4
 subroutine vec_sld_test_i4i4(arg1, arg2)
   vector(integer(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_4)
@@ -225,7 +225,7 @@ subroutine vec_sld_test_i4i4(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i4i4
 
-! CHECK-LABEL: vec_sld_test_i4i8
+! LLVM-LABEL: vec_sld_test_i4i8
 subroutine vec_sld_test_i4i8(arg1, arg2)
   vector(integer(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_8)
@@ -247,7 +247,7 @@ subroutine vec_sld_test_i4i8(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_i4i8
 
-! CHECK-LABEL: vec_sld_test_u1i1
+! LLVM-LABEL: vec_sld_test_u1i1
 subroutine vec_sld_test_u1i1(arg1, arg2)
   vector(unsigned(1)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_1)
@@ -263,7 +263,7 @@ subroutine vec_sld_test_u1i1(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u1i1
 
-! CHECK-LABEL: vec_sld_test_u1i2
+! LLVM-LABEL: vec_sld_test_u1i2
 subroutine vec_sld_test_u1i2(arg1, arg2)
   vector(unsigned(1)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_2)
@@ -279,7 +279,7 @@ subroutine vec_sld_test_u1i2(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u1i2
 
-! CHECK-LABEL: vec_sld_test_u1i4
+! LLVM-LABEL: vec_sld_test_u1i4
 subroutine vec_sld_test_u1i4(arg1, arg2)
   vector(unsigned(1)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_1)
@@ -295,7 +295,7 @@ subroutine vec_sld_test_u1i4(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u1i4
 
-! CHECK-LABEL: vec_sld_test_u1i8
+! LLVM-LABEL: vec_sld_test_u1i8
 subroutine vec_sld_test_u1i8(arg1, arg2)
   vector(unsigned(1)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_1)
@@ -311,7 +311,7 @@ subroutine vec_sld_test_u1i8(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u1i8
 
-! CHECK-LABEL: vec_sld_test_u2i1
+! LLVM-LABEL: vec_sld_test_u2i1
 subroutine vec_sld_test_u2i1(arg1, arg2)
   vector(unsigned(2)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_1)
@@ -333,7 +333,7 @@ subroutine vec_sld_test_u2i1(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u2i1
 
-! CHECK-LABEL: vec_sld_test_u2i2
+! LLVM-LABEL: vec_sld_test_u2i2
 subroutine vec_sld_test_u2i2(arg1, arg2)
   vector(unsigned(2)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_2)
@@ -355,7 +355,7 @@ subroutine vec_sld_test_u2i2(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u2i2
 
-! CHECK-LABEL: vec_sld_test_u2i4
+! LLVM-LABEL: vec_sld_test_u2i4
 subroutine vec_sld_test_u2i4(arg1, arg2)
   vector(unsigned(2)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_4)
@@ -377,7 +377,7 @@ subroutine vec_sld_test_u2i4(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u2i4
 
-! CHECK-LABEL: vec_sld_test_u2i8
+! LLVM-LABEL: vec_sld_test_u2i8
 subroutine vec_sld_test_u2i8(arg1, arg2)
   vector(unsigned(2)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_8)
@@ -399,7 +399,7 @@ subroutine vec_sld_test_u2i8(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u2i8
 
-! CHECK-LABEL: vec_sld_test_u4i1
+! LLVM-LABEL: vec_sld_test_u4i1
 subroutine vec_sld_test_u4i1(arg1, arg2)
   vector(unsigned(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_1)
@@ -421,7 +421,7 @@ subroutine vec_sld_test_u4i1(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u4i1
 
-! CHECK-LABEL: vec_sld_test_u4i2
+! LLVM-LABEL: vec_sld_test_u4i2
 subroutine vec_sld_test_u4i2(arg1, arg2)
   vector(unsigned(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_2)
@@ -443,7 +443,7 @@ subroutine vec_sld_test_u4i2(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u4i2
 
-! CHECK-LABEL: vec_sld_test_u4i4
+! LLVM-LABEL: vec_sld_test_u4i4
 subroutine vec_sld_test_u4i4(arg1, arg2)
   vector(unsigned(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_4)
@@ -465,7 +465,7 @@ subroutine vec_sld_test_u4i4(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u4i4
 
-! CHECK-LABEL: vec_sld_test_u4i8
+! LLVM-LABEL: vec_sld_test_u4i8
 subroutine vec_sld_test_u4i8(arg1, arg2)
   vector(unsigned(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_8)
@@ -487,7 +487,7 @@ subroutine vec_sld_test_u4i8(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_u4i8
 
-! CHECK-LABEL: vec_sld_test_r4i1
+! LLVM-LABEL: vec_sld_test_r4i1
 subroutine vec_sld_test_r4i1(arg1, arg2)
   vector(real(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_1)
@@ -509,7 +509,7 @@ subroutine vec_sld_test_r4i1(arg1, arg2)
 ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_r4i1
 
-! CHECK-LABEL: vec_sld_test_r4i2
+! LLVM-LABEL: vec_sld_test_r4i2
 subroutine vec_sld_test_r4i2(arg1, arg2)
   vector(real(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_2)
@@ -531,7 +531,7 @@ subroutine vec_sld_test_r4i2(arg1, arg2)
 ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_r4i2
 
-! CHECK-LABEL: vec_sld_test_r4i4
+! LLVM-LABEL: vec_sld_test_r4i4
 subroutine vec_sld_test_r4i4(arg1, arg2)
   vector(real(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 3_4)
@@ -553,7 +553,7 @@ subroutine vec_sld_test_r4i4(arg1, arg2)
 ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sld_test_r4i4
 
-! CHECK-LABEL: vec_sld_test_r4i8
+! LLVM-LABEL: vec_sld_test_r4i8
 subroutine vec_sld_test_r4i8(arg1, arg2)
   vector(real(4)) :: arg1, arg2, r
   r = vec_sld(arg1, arg2, 1_8)
@@ -578,7 +578,7 @@ end subroutine vec_sld_test_r4i8
 !----------------------
 ! vec_sldw
 !----------------------
-! CHECK-LABEL: vec_sldw_test_i1i1
+! LLVM-LABEL: vec_sldw_test_i1i1
 subroutine vec_sldw_test_i1i1(arg1, arg2)
   vector(integer(1)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_1)
@@ -594,7 +594,7 @@ subroutine vec_sldw_test_i1i1(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i1i1
 
-! CHECK-LABEL: vec_sldw_test_i1i2
+! LLVM-LABEL: vec_sldw_test_i1i2
 subroutine vec_sldw_test_i1i2(arg1, arg2)
   vector(integer(1)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_2)
@@ -610,7 +610,7 @@ subroutine vec_sldw_test_i1i2(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i1i2
 
-! CHECK-LABEL: vec_sldw_test_i1i4
+! LLVM-LABEL: vec_sldw_test_i1i4
 subroutine vec_sldw_test_i1i4(arg1, arg2)
   vector(integer(1)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_4)
@@ -626,7 +626,7 @@ subroutine vec_sldw_test_i1i4(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i1i4
 
-! CHECK-LABEL: vec_sldw_test_i1i8
+! LLVM-LABEL: vec_sldw_test_i1i8
 subroutine vec_sldw_test_i1i8(arg1, arg2)
   vector(integer(1)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_8)
@@ -642,7 +642,7 @@ subroutine vec_sldw_test_i1i8(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i1i8
 
-! CHECK-LABEL: vec_sldw_test_i2i1
+! LLVM-LABEL: vec_sldw_test_i2i1
 subroutine vec_sldw_test_i2i1(arg1, arg2)
   vector(integer(2)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_1)
@@ -664,7 +664,7 @@ subroutine vec_sldw_test_i2i1(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i2i1
 
-! CHECK-LABEL: vec_sldw_test_i2i2
+! LLVM-LABEL: vec_sldw_test_i2i2
 subroutine vec_sldw_test_i2i2(arg1, arg2)
   vector(integer(2)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_2)
@@ -686,7 +686,7 @@ subroutine vec_sldw_test_i2i2(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i2i2
 
-! CHECK-LABEL: vec_sldw_test_i2i4
+! LLVM-LABEL: vec_sldw_test_i2i4
 subroutine vec_sldw_test_i2i4(arg1, arg2)
   vector(integer(2)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_4)
@@ -708,7 +708,7 @@ subroutine vec_sldw_test_i2i4(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i2i4
 
-! CHECK-LABEL: vec_sldw_test_i2i8
+! LLVM-LABEL: vec_sldw_test_i2i8
 subroutine vec_sldw_test_i2i8(arg1, arg2)
   vector(integer(2)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_8)
@@ -730,7 +730,7 @@ subroutine vec_sldw_test_i2i8(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i2i8
 
-! CHECK-LABEL: vec_sldw_test_i4i1
+! LLVM-LABEL: vec_sldw_test_i4i1
 subroutine vec_sldw_test_i4i1(arg1, arg2)
   vector(integer(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_1)
@@ -752,7 +752,7 @@ subroutine vec_sldw_test_i4i1(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i4i1
 
-! CHECK-LABEL: vec_sldw_test_i4i2
+! LLVM-LABEL: vec_sldw_test_i4i2
 subroutine vec_sldw_test_i4i2(arg1, arg2)
   vector(integer(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_2)
@@ -774,7 +774,7 @@ subroutine vec_sldw_test_i4i2(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i4i2
 
-! CHECK-LABEL: vec_sldw_test_i4i4
+! LLVM-LABEL: vec_sldw_test_i4i4
 subroutine vec_sldw_test_i4i4(arg1, arg2)
   vector(integer(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_4)
@@ -796,7 +796,7 @@ subroutine vec_sldw_test_i4i4(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i4i4
 
-! CHECK-LABEL: vec_sldw_test_i4i8
+! LLVM-LABEL: vec_sldw_test_i4i8
 subroutine vec_sldw_test_i4i8(arg1, arg2)
   vector(integer(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_8)
@@ -818,7 +818,7 @@ subroutine vec_sldw_test_i4i8(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i4i8
 
-! CHECK-LABEL: vec_sldw_test_i8i1
+! LLVM-LABEL: vec_sldw_test_i8i1
 subroutine vec_sldw_test_i8i1(arg1, arg2)
   vector(integer(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_1)
@@ -840,7 +840,7 @@ subroutine vec_sldw_test_i8i1(arg1, arg2)
 ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i8i1
 
-! CHECK-LABEL: vec_sldw_test_i8i2
+! LLVM-LABEL: vec_sldw_test_i8i2
 subroutine vec_sldw_test_i8i2(arg1, arg2)
   vector(integer(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_2)
@@ -862,7 +862,7 @@ subroutine vec_sldw_test_i8i2(arg1, arg2)
 ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i8i2
 
-! CHECK-LABEL: vec_sldw_test_i8i4
+! LLVM-LABEL: vec_sldw_test_i8i4
 subroutine vec_sldw_test_i8i4(arg1, arg2)
   vector(integer(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_4)
@@ -884,7 +884,7 @@ subroutine vec_sldw_test_i8i4(arg1, arg2)
 ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_i8i4
 
-! CHECK-LABEL: vec_sldw_test_i8i8
+! LLVM-LABEL: vec_sldw_test_i8i8
 subroutine vec_sldw_test_i8i8(arg1, arg2)
   vector(integer(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_8)
@@ -907,7 +907,7 @@ subroutine vec_sldw_test_i8i8(arg1, arg2)
 
 end subroutine vec_sldw_test_i8i8
 
-! CHECK-LABEL: vec_sldw_test_u1i1
+! LLVM-LABEL: vec_sldw_test_u1i1
 subroutine vec_sldw_test_u1i1(arg1, arg2)
   vector(unsigned(1)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_1)
@@ -923,7 +923,7 @@ subroutine vec_sldw_test_u1i1(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u1i1
 
-! CHECK-LABEL: vec_sldw_test_u1i2
+! LLVM-LABEL: vec_sldw_test_u1i2
 subroutine vec_sldw_test_u1i2(arg1, arg2)
   vector(unsigned(1)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_2)
@@ -939,7 +939,7 @@ subroutine vec_sldw_test_u1i2(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u1i2
 
-! CHECK-LABEL: vec_sldw_test_u1i4
+! LLVM-LABEL: vec_sldw_test_u1i4
 subroutine vec_sldw_test_u1i4(arg1, arg2)
   vector(unsigned(1)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_4)
@@ -955,7 +955,7 @@ subroutine vec_sldw_test_u1i4(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u1i4
 
-! CHECK-LABEL: vec_sldw_test_u1i8
+! LLVM-LABEL: vec_sldw_test_u1i8
 subroutine vec_sldw_test_u1i8(arg1, arg2)
   vector(unsigned(1)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_8)
@@ -971,7 +971,7 @@ subroutine vec_sldw_test_u1i8(arg1, arg2)
 ! BE-LLVMIR: store <16 x i8> %[[r]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u1i8
 
-! CHECK-LABEL: vec_sldw_test_u2i1
+! LLVM-LABEL: vec_sldw_test_u2i1
 subroutine vec_sldw_test_u2i1(arg1, arg2)
   vector(unsigned(2)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_1)
@@ -993,7 +993,7 @@ subroutine vec_sldw_test_u2i1(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u2i1
 
-! CHECK-LABEL: vec_sldw_test_u2i2
+! LLVM-LABEL: vec_sldw_test_u2i2
 subroutine vec_sldw_test_u2i2(arg1, arg2)
   vector(unsigned(2)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_2)
@@ -1015,7 +1015,7 @@ subroutine vec_sldw_test_u2i2(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u2i2
 
-! CHECK-LABEL: vec_sldw_test_u2i4
+! LLVM-LABEL: vec_sldw_test_u2i4
 subroutine vec_sldw_test_u2i4(arg1, arg2)
   vector(unsigned(2)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_4)
@@ -1037,7 +1037,7 @@ subroutine vec_sldw_test_u2i4(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u2i4
 
-! CHECK-LABEL: vec_sldw_test_u2i8
+! LLVM-LABEL: vec_sldw_test_u2i8
 subroutine vec_sldw_test_u2i8(arg1, arg2)
   vector(unsigned(2)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_8)
@@ -1059,7 +1059,7 @@ subroutine vec_sldw_test_u2i8(arg1, arg2)
 ! BE-LLVMIR: store <8 x i16> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u2i8
 
-! CHECK-LABEL: vec_sldw_test_u4i1
+! LLVM-LABEL: vec_sldw_test_u4i1
 subroutine vec_sldw_test_u4i1(arg1, arg2)
   vector(unsigned(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_1)
@@ -1081,7 +1081,7 @@ subroutine vec_sldw_test_u4i1(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u4i1
 
-! CHECK-LABEL: vec_sldw_test_u4i2
+! LLVM-LABEL: vec_sldw_test_u4i2
 subroutine vec_sldw_test_u4i2(arg1, arg2)
   vector(unsigned(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_2)
@@ -1103,7 +1103,7 @@ subroutine vec_sldw_test_u4i2(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u4i2
 
-! CHECK-LABEL: vec_sldw_test_u4i4
+! LLVM-LABEL: vec_sldw_test_u4i4
 subroutine vec_sldw_test_u4i4(arg1, arg2)
   vector(unsigned(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_4)
@@ -1125,7 +1125,7 @@ subroutine vec_sldw_test_u4i4(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u4i4
 
-! CHECK-LABEL: vec_sldw_test_u4i8
+! LLVM-LABEL: vec_sldw_test_u4i8
 subroutine vec_sldw_test_u4i8(arg1, arg2)
   vector(unsigned(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_8)
@@ -1147,7 +1147,7 @@ subroutine vec_sldw_test_u4i8(arg1, arg2)
 ! BE-LLVMIR: store <4 x i32> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u4i8
 
-! CHECK-LABEL: vec_sldw_test_u8i1
+! LLVM-LABEL: vec_sldw_test_u8i1
 subroutine vec_sldw_test_u8i1(arg1, arg2)
   vector(unsigned(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_1)
@@ -1169,7 +1169,7 @@ subroutine vec_sldw_test_u8i1(arg1, arg2)
 ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u8i1
 
-! CHECK-LABEL: vec_sldw_test_u8i2
+! LLVM-LABEL: vec_sldw_test_u8i2
 subroutine vec_sldw_test_u8i2(arg1, arg2)
   vector(unsigned(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_2)
@@ -1191,7 +1191,7 @@ subroutine vec_sldw_test_u8i2(arg1, arg2)
 ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u8i2
 
-! CHECK-LABEL: vec_sldw_test_u8i4
+! LLVM-LABEL: vec_sldw_test_u8i4
 subroutine vec_sldw_test_u8i4(arg1, arg2)
   vector(unsigned(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_4)
@@ -1213,7 +1213,7 @@ subroutine vec_sldw_test_u8i4(arg1, arg2)
 ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u8i4
 
-! CHECK-LABEL: vec_sldw_test_u8i8
+! LLVM-LABEL: vec_sldw_test_u8i8
 subroutine vec_sldw_test_u8i8(arg1, arg2)
   vector(unsigned(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_8)
@@ -1235,7 +1235,7 @@ subroutine vec_sldw_test_u8i8(arg1, arg2)
 ! BE-LLVMIR: store <2 x i64> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_u8i8
 
-! CHECK-LABEL: vec_sldw_test_r4i1
+! LLVM-LABEL: vec_sldw_test_r4i1
 subroutine vec_sldw_test_r4i1(arg1, arg2)
   vector(real(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_1)
@@ -1257,7 +1257,7 @@ subroutine vec_sldw_test_r4i1(arg1, arg2)
 ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_r4i1
 
-! CHECK-LABEL: vec_sldw_test_r4i2
+! LLVM-LABEL: vec_sldw_test_r4i2
 subroutine vec_sldw_test_r4i2(arg1, arg2)
   vector(real(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_2)
@@ -1279,7 +1279,7 @@ subroutine vec_sldw_test_r4i2(arg1, arg2)
 ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_r4i2
 
-! CHECK-LABEL: vec_sldw_test_r4i4
+! LLVM-LABEL: vec_sldw_test_r4i4
 subroutine vec_sldw_test_r4i4(arg1, arg2)
   vector(real(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_4)
@@ -1301,7 +1301,7 @@ subroutine vec_sldw_test_r4i4(arg1, arg2)
 ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_r4i4
 
-! CHECK-LABEL: vec_sldw_test_r4i8
+! LLVM-LABEL: vec_sldw_test_r4i8
 subroutine vec_sldw_test_r4i8(arg1, arg2)
   vector(real(4)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_8)
@@ -1323,7 +1323,7 @@ subroutine vec_sldw_test_r4i8(arg1, arg2)
 ! BE-LLVMIR: store <4 x float> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_r4i8
 
-! CHECK-LABEL: vec_sldw_test_r8i1
+! LLVM-LABEL: vec_sldw_test_r8i1
 subroutine vec_sldw_test_r8i1(arg1, arg2)
   vector(real(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_1)
@@ -1345,7 +1345,7 @@ subroutine vec_sldw_test_r8i1(arg1, arg2)
 ! BE-LLVMIR: store <2 x double> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_r8i1
 
-! CHECK-LABEL: vec_sldw_test_r8i2
+! LLVM-LABEL: vec_sldw_test_r8i2
 subroutine vec_sldw_test_r8i2(arg1, arg2)
   vector(real(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_2)
@@ -1367,7 +1367,7 @@ subroutine vec_sldw_test_r8i2(arg1, arg2)
 ! BE-LLVMIR: store <2 x double> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_r8i2
 
-! CHECK-LABEL: vec_sldw_test_r8i4
+! LLVM-LABEL: vec_sldw_test_r8i4
 subroutine vec_sldw_test_r8i4(arg1, arg2)
   vector(real(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_4)
@@ -1389,7 +1389,7 @@ subroutine vec_sldw_test_r8i4(arg1, arg2)
 ! BE-LLVMIR: store <2 x double> %[[br]], ptr %{{.*}}, align 16
 end subroutine vec_sldw_test_r8i4
 
-! CHECK-LABEL: vec_sldw_test_r8i8
+! LLVM-LABEL: vec_sldw_test_r8i8
 subroutine vec_sldw_test_r8i8(arg1, arg2)
   vector(real(8)) :: arg1, arg2, r
   r = vec_sldw(arg1, arg2, 3_8)
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 91b8cb71552a7..66b82c84dac49 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -87,4 +87,14 @@ add_proxy_header_library(
     libc.include.llvm-libc-macros.time_macros
 )
 
+add_proxy_header_library(
+  float_macros
+  HDRS
+    float_macros.h
+  DEPENDS
+    libc.include.llvm-libc-macros.float_macros
+  FULL_BUILD_DEPENDS
+    libc.include.float
+)
+
 add_subdirectory(types)
diff --git a/libc/hdr/float_macros.h b/libc/hdr/float_macros.h
new file mode 100644
index 0000000000000..a0ef5e29b9868
--- /dev/null
+++ b/libc/hdr/float_macros.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from float.h ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_FLOAT_MACROS_H
+#define LLVM_LIBC_HDR_FLOAT_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/float-macros.h"
+
+#else // Overlay mode
+
+#include <float.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_FLOAT_MACROS_H
diff --git a/libc/include/llvm-libc-macros/float-macros.h b/libc/include/llvm-libc-macros/float-macros.h
index 4fe8590c5f70c..81c1df868bf6c 100644
--- a/libc/include/llvm-libc-macros/float-macros.h
+++ b/libc/include/llvm-libc-macros/float-macros.h
@@ -9,21 +9,6 @@
 #ifndef LLVM_LIBC_MACROS_FLOAT_MACROS_H
 #define LLVM_LIBC_MACROS_FLOAT_MACROS_H
 
-// Suppress `#include_next is a language extension` warnings.
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wgnu-include-next"
-#pragma clang diagnostic ignored "-Winclude-next-absolute-path"
-#else // gcc
-#pragma GCC system_header
-#endif //__clang__
-
-#include_next <float.h>
-
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif //__clang__
-
 #ifndef FLT_RADIX
 #define FLT_RADIX __FLT_RADIX__
 #endif // FLT_RADIX
@@ -32,9 +17,13 @@
 #define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
 #endif // FLT_EVAL_METHOD
 
-#ifndef DECIMAL_DIG
-#define DECIMAL_DIG __DECIMAL_DIG__
-#endif // DECIMAL_DIG
+#ifndef FLT_ROUNDS
+#if __has_builtin(__builtin_flt_rounds)
+#define FLT_ROUNDS __builtin_flt_rounds()
+#else
+#define FLT_ROUNDS 1
+#endif
+#endif // FLT_ROUNDS
 
 #ifndef FLT_DECIMAL_DIG
 #define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
@@ -48,6 +37,10 @@
 #define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
 #endif // LDBL_DECIMAL_DIG
 
+#ifndef DECIMAL_DIG
+#define DECIMAL_DIG __DECIMAL_DIG__
+#endif // DECIMAL_DIG
+
 #ifndef FLT_DIG
 #define FLT_DIG __FLT_DIG__
 #endif // FLT_DIG
@@ -97,15 +90,15 @@
 #endif // LDBL_MAX
 
 #ifndef FLT_TRUE_MIN
-#define FLT_TRUE_MIN __FLT_TRUE_MIN__
+#define FLT_TRUE_MIN __FLT_DENORM_MIN__
 #endif // FLT_TRUE_MIN
 
 #ifndef DBL_TRUE_MIN
-#define DBL_TRUE_MIN __DBL_TRUE_MIN__
+#define DBL_TRUE_MIN __DBL_DENORM_MIN__
 #endif // DBL_TRUE_MIN
 
 #ifndef LDBL_TRUE_MIN
-#define LDBL_TRUE_MIN __LDBL_TRUE_MIN__
+#define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
 #endif // LDBL_TRUE_MIN
 
 #ifndef FLT_EPSILON
diff --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt
index bbc45650f3fca..7718aeaa3de5a 100644
--- a/libc/src/__support/macros/properties/CMakeLists.txt
+++ b/libc/src/__support/macros/properties/CMakeLists.txt
@@ -33,6 +33,6 @@ add_header_library(
     .compiler
     .cpu_features
     .os
-    libc.include.llvm-libc-macros.float_macros
+    libc.hdr.float_macros
     libc.include.llvm-libc-types.float128
 )
diff --git a/libc/src/__support/macros/properties/types.h b/libc/src/__support/macros/properties/types.h
index d43cf99e6859b..781cf1b7a2b62 100644
--- a/libc/src/__support/macros/properties/types.h
+++ b/libc/src/__support/macros/properties/types.h
@@ -10,7 +10,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H
 #define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H
 
-#include "include/llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
+#include "hdr/float_macros.h"                      // LDBL_MANT_DIG
 #include "include/llvm-libc-types/float128.h"      // float128
 #include "src/__support/macros/properties/architectures.h"
 #include "src/__support/macros/properties/compiler.h"
diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt
index 39c4ad20201ca..f6913ef083428 100644
--- a/libc/src/__support/threads/linux/CMakeLists.txt
+++ b/libc/src/__support/threads/linux/CMakeLists.txt
@@ -75,4 +75,5 @@ add_object_library(
     libc.src.__support.OSUtil.osutil
     libc.src.__support.threads.linux.futex_word_type
     libc.src.__support.threads.mutex
+    libc.src.__support.CPP.mutex
 )
diff --git a/libc/src/__support/threads/linux/CndVar.cpp b/libc/src/__support/threads/linux/CndVar.cpp
index daf56bca1ed21..b3a0fdbda4e9e 100644
--- a/libc/src/__support/threads/linux/CndVar.cpp
+++ b/libc/src/__support/threads/linux/CndVar.cpp
@@ -7,9 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/threads/CndVar.h"
+#include "src/__support/CPP/mutex.h"
 #include "src/__support/OSUtil/syscall.h"           // syscall_impl
 #include "src/__support/threads/linux/futex_word.h" // FutexWordType
-#include "src/__support/threads/mutex.h"            // Mutex, MutexLock
+#include "src/__support/threads/mutex.h"            // Mutex
 
 #include <sys/syscall.h> // For syscall numbers.
 
@@ -27,7 +28,7 @@ int CndVar::wait(Mutex *m) {
 
   CndWaiter waiter;
   {
-    MutexLock ml(&qmtx);
+    cpp::lock_guard ml(qmtx);
     CndWaiter *old_back = nullptr;
     if (waitq_front == nullptr) {
       waitq_front = waitq_back = &waiter;
@@ -83,7 +84,7 @@ void CndVar::notify_one() {
 }
 
 void CndVar::broadcast() {
-  MutexLock ml(&qmtx);
+  cpp::lock_guard ml(qmtx);
   uint32_t dummy_futex_word;
   CndWaiter *waiter = waitq_front;
   waitq_front = waitq_back = nullptr;
diff --git a/libc/src/__support/threads/mutex.h b/libc/src/__support/threads/mutex.h
index 9dded2e3f952a..392b38984dc0a 100644
--- a/libc/src/__support/threads/mutex.h
+++ b/libc/src/__support/threads/mutex.h
@@ -43,18 +43,4 @@
 #include "src/__support/threads/gpu/mutex.h"
 #endif // __linux__
 
-namespace LIBC_NAMESPACE {
-
-// An RAII class for easy locking and unlocking of mutexes.
-class MutexLock {
-  Mutex *mutex;
-
-public:
-  explicit MutexLock(Mutex *m) : mutex(m) { mutex->lock(); }
-
-  ~MutexLock() { mutex->unlock(); }
-};
-
-} // namespace LIBC_NAMESPACE
-
 #endif // LLVM_LIBC_SRC___SUPPORT_THREADS_MUTEX_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index daaf505008ca1..269bc6be5d834 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -2933,6 +2933,7 @@ add_entrypoint_object(
   HDRS
     ../scalbn.h
   DEPENDS
+    libc.hdr.float_macros
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
@@ -2945,6 +2946,7 @@ add_entrypoint_object(
   HDRS
     ../scalbnf.h
   DEPENDS
+    libc.hdr.float_macros
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
@@ -2957,6 +2959,7 @@ add_entrypoint_object(
   HDRS
     ../scalbnl.h
   DEPENDS
+    libc.hdr.float_macros
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
@@ -2969,6 +2972,7 @@ add_entrypoint_object(
   HDRS
     ../scalbnf128.h
   DEPENDS
+    libc.hdr.float_macros
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
diff --git a/libc/src/math/generic/scalbn.cpp b/libc/src/math/generic/scalbn.cpp
index 3908f5892f144..207cce1550bc0 100644
--- a/libc/src/math/generic/scalbn.cpp
+++ b/libc/src/math/generic/scalbn.cpp
@@ -7,19 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/scalbn.h"
+#include "hdr/float_macros.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/__support/common.h"
 
+#if FLT_RADIX != 2
+#error "FLT_RADIX != 2 is not supported."
+#endif
+
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(double, scalbn, (double x, int n)) {
-#if !defined(__FLT_RADIX__)
-#error __FLT_RADIX__ undefined.
-#elif __FLT_RADIX__ != 2
-#error __FLT_RADIX__!=2, unimplemented.
-#else
   return fputil::ldexp(x, n);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/scalbnf.cpp b/libc/src/math/generic/scalbnf.cpp
index 4a4fa86dcfd89..e478088d3ce5a 100644
--- a/libc/src/math/generic/scalbnf.cpp
+++ b/libc/src/math/generic/scalbnf.cpp
@@ -7,19 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/scalbnf.h"
+#include "hdr/float_macros.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/__support/common.h"
 
+#if FLT_RADIX != 2
+#error "FLT_RADIX != 2 is not supported."
+#endif
+
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, scalbnf, (float x, int n)) {
-#if !defined(__FLT_RADIX__)
-#error __FLT_RADIX__ undefined.
-#elif __FLT_RADIX__ != 2
-#error __FLT_RADIX__!=2, unimplemented.
-#else
   return fputil::ldexp(x, n);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/scalbnf128.cpp b/libc/src/math/generic/scalbnf128.cpp
index be3d29ed27e98..5fd59611d53de 100644
--- a/libc/src/math/generic/scalbnf128.cpp
+++ b/libc/src/math/generic/scalbnf128.cpp
@@ -7,21 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/scalbnf128.h"
+#include "hdr/float_macros.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/__support/common.h"
 
+#if FLT_RADIX != 2
+#error "FLT_RADIX != 2 is not supported."
+#endif
+
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float128, scalbnf128, (float128 x, int n)) {
-// TODO: should be switched to use `FLT_RADIX` in hdr/float_macros.h" instead
-// see: https://github.com/llvm/llvm-project/issues/90496
-#if !defined(__FLT_RADIX__)
-#error __FLT_RADIX__ undefined.
-#elif __FLT_RADIX__ != 2
-#error __FLT_RADIX__!=2, unimplemented.
-#else
   return fputil::ldexp(x, n);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/scalbnl.cpp b/libc/src/math/generic/scalbnl.cpp
index 681338ec01f07..1225a7ebaf572 100644
--- a/libc/src/math/generic/scalbnl.cpp
+++ b/libc/src/math/generic/scalbnl.cpp
@@ -7,19 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/scalbnl.h"
+#include "hdr/float_macros.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/__support/common.h"
 
+#if FLT_RADIX != 2
+#error "FLT_RADIX != 2 is not supported."
+#endif
+
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(long double, scalbnl, (long double x, int n)) {
-#if !defined(__FLT_RADIX__)
-#error __FLT_RADIX__ undefined.
-#elif __FLT_RADIX__ != 2
-#error __FLT_RADIX__!=2, unimplemented.
-#else
   return fputil::ldexp(x, n);
-#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index cb5e0e5e6cdb5..a061fda88b5c6 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -122,7 +122,7 @@ option(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS
    on definitions in a shared library. By default, we assume that we're not building
    libc++ for any specific vendor, and we disable those annotations. Vendors wishing
    to provide compile-time errors when using features unavailable on some version of
-   the shared library they shipped should turn this on and see `include/__availability`
+   the shared library they shipped should turn this on and see `include/__configuration/availability.h`
    for more details." OFF)
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
@@ -856,15 +856,14 @@ endfunction()
 #===============================================================================
 # Setup Source Code And Tests
 #===============================================================================
+add_custom_target(cxx-test-depends
+  COMMENT "Build dependencies required to run the libc++ test suite.")
+
 add_subdirectory(include)
 add_subdirectory(src)
 add_subdirectory(utils)
 add_subdirectory(modules)
 
-set(LIBCXX_TEST_DEPS "cxx_experimental")
-
-list(APPEND LIBCXX_TEST_DEPS generate-cxx-modules)
-
 if (LIBCXX_INCLUDE_BENCHMARKS)
   add_subdirectory(benchmarks)
 endif()
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 93b549a316e38..2101f9c71788c 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -252,10 +252,6 @@ endforeach()
 if (LIBCXX_INCLUDE_TESTS)
   include(AddLLVM)
 
-  if (NOT DEFINED LIBCXX_TEST_DEPS)
-    message(FATAL_ERROR "Expected LIBCXX_TEST_DEPS to be defined")
-  endif()
-
   configure_lit_site_cfg(
           ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
           ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py)
@@ -265,6 +261,6 @@ if (LIBCXX_INCLUDE_TESTS)
   add_lit_target(check-cxx-benchmarks
           "Running libcxx benchmarks tests"
           ${CMAKE_CURRENT_BINARY_DIR}
-          DEPENDS cxx-benchmarks ${LIBCXX_TEST_DEPS}
+          DEPENDS cxx-benchmarks cxx-test-depends
           ARGS ${BENCHMARK_LIT_ARGS})
 endif()
diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index 17d2da907692e..0297068785e8b 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -326,8 +326,6 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_expected``                                     ``202211L``
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_format_path``                                  *unimplemented*
-    ---------------------------------------------------------- -----------------
     ``__cpp_lib_format_ranges``                                ``202207L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_formatters``                                   *unimplemented*
@@ -386,8 +384,6 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_string_resize_and_overwrite``                  ``202110L``
     ---------------------------------------------------------- -----------------
-    ``__cpp_lib_to_string``                                    *unimplemented*
-    ---------------------------------------------------------- -----------------
     ``__cpp_lib_to_underlying``                                ``202102L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_tuple_like``                                   *unimplemented*
@@ -412,6 +408,8 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_default_template_type_for_algorithm_values``   *unimplemented*
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_format_path``                                  *unimplemented*
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_freestanding_algorithm``                       *unimplemented*
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_freestanding_array``                           *unimplemented*
@@ -466,6 +464,8 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_to_chars``                                     *unimplemented*
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_to_string``                                    *unimplemented*
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_tuple_like``                                   *unimplemented*
     ========================================================== =================
 
diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 5f83fa3a92e87..54517ab002b86 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -200,9 +200,9 @@
 "`3200 <https://wg21.link/LWG3200>`__","``midpoint``\  should not constrain ``T``\  is complete","Prague","|Nothing To Do|",""
 "`3201 <https://wg21.link/LWG3201>`__","``lerp``\  should be marked as ``noexcept``\ ","Prague","|Complete|",""
 "`3226 <https://wg21.link/LWG3226>`__","``zoned_time``\  constructor from ``string_view``\  should accept ``zoned_time<Duration2, TimeZonePtr2>``\ ","Prague","","","|chrono|"
-"`3233 <https://wg21.link/LWG3233>`__","Broken requirements for ``shared_ptr``\  converting constructors","Prague","",""
+"`3233 <https://wg21.link/LWG3233>`__","Broken requirements for ``shared_ptr``\  converting constructors","Prague","|Complete|","19.0"
 "`3237 <https://wg21.link/LWG3237>`__","LWG 3038 and 3190 have inconsistent PRs","Prague","|Complete|","16.0"
-"`3238 <https://wg21.link/LWG3238>`__","Insufficiently-defined behavior of ``std::function``\  deduction guides","Prague","",""
+"`3238 <https://wg21.link/LWG3238>`__","Insufficiently-defined behavior of ``std::function``\  deduction guides","Prague","|Nothing To Do|",""
 "`3242 <https://wg21.link/LWG3242>`__","``std::format``\ : missing rules for ``arg-id``\  in ``width``\  and ``precision``\ ","Prague","|Complete|","14.0","|format|"
 "`3243 <https://wg21.link/LWG3243>`__","``std::format``\  and negative zeroes","Prague","|Complete|","14.0","|format|"
 "`3247 <https://wg21.link/LWG3247>`__","``ranges::iter_move``\  should perform ADL-only lookup of ``iter_move``\ ","Prague","|Complete|","15.0","|ranges|"
@@ -285,7 +285,7 @@
 "`3379 <https://wg21.link/LWG3379>`__","""``safe``\ "" in several library names is misleading","Prague","|Complete|","15.0","|ranges|"
 "`3380 <https://wg21.link/LWG3380>`__","``common_type``\  and comparison categories","Prague","|Complete|","15.0","|spaceship|"
 "`3381 <https://wg21.link/LWG3381>`__","``begin``\  and ``data``\  must agree for ``contiguous_range``\ ","Prague","|Nothing To Do|","","|ranges|"
-"`3382 <https://wg21.link/LWG3382>`__","NTTP for ``pair``\  and ``array``\ ","Prague","",""
+"`3382 <https://wg21.link/LWG3382>`__","NTTP for ``pair``\  and ``array``\ ","Prague","|Nothing To Do|",""
 "`3383 <https://wg21.link/LWG3383>`__","|sect|\ [time.zone.leap.nonmembers] ``sys_seconds``\  should be replaced with ``seconds``\ ","Prague","|Complete|","19.0","|chrono|"
 "`3384 <https://wg21.link/LWG3384>`__","``transform_view::*sentinel*``\  has an incorrect ``operator-``\ ","Prague","|Complete|","15.0","|ranges|"
 "`3385 <https://wg21.link/LWG3385>`__","``common_iterator``\  is not sufficiently constrained for non-copyable iterators","Prague","|Complete|","15.0","|ranges|"
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index 76717e1d3448a..8d24457186310 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -29,7 +29,7 @@
 "`3947 <https://wg21.link/LWG3947>`__","Unexpected constraints on ``adjacent_transform_view::base()``","Kona November 2023","","","|ranges|"
 "`3948 <https://wg21.link/LWG3948>`__","``possibly-const-range and as-const-pointer`` should be ``noexcept``","Kona November 2023","","","|ranges|"
 "`3949 <https://wg21.link/LWG3949>`__","``std::atomic<bool>``'s trivial destructor dropped in C++17 spec wording","Kona November 2023","","",""
-"`3951 <https://wg21.link/LWG3951>`__","[expected.object.swap]: Using ``value()`` instead of ``has_value()``","Kona November 2023","","",""
+"`3951 <https://wg21.link/LWG3951>`__","[expected.object.swap]: Using ``value()`` instead of ``has_value()``","Kona November 2023","|Complete|","16.0",""
 "`3953 <https://wg21.link/LWG3953>`__","``iter_move`` for ``common_iterator`` and ``counted_iterator`` should return ``decltype(auto)``","Kona November 2023","","","|ranges|"
 "`3957 <https://wg21.link/LWG3957>`__","[container.alloc.reqmts] The value category of v should be claimed","Kona November 2023","","",""
 "`3965 <https://wg21.link/LWG3965>`__","Incorrect example in [format.string.escaped] p3 for formatting of combining characters","Kona November 2023","|Complete|","19.0","|format|"
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 161d7a7d215bd..cfe1f44777bca 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -219,7 +219,6 @@ set(files
   __atomic/kill_dependency.h
   __atomic/memory_order.h
   __atomic/to_gcc_order.h
-  __availability
   __bit/bit_cast.h
   __bit/bit_ceil.h
   __bit/bit_floor.h
@@ -315,7 +314,9 @@ set(files
   __condition_variable/condition_variable.h
   __config
   __configuration/abi.h
+  __configuration/availability.h
   __configuration/compiler.h
+  __configuration/language.h
   __configuration/platform.h
   __coroutine/coroutine_handle.h
   __coroutine/coroutine_traits.h
diff --git a/libcxx/include/__atomic/atomic_base.h b/libcxx/include/__atomic/atomic_base.h
index e9badccc25a62..d7a5b99b54691 100644
--- a/libcxx/include/__atomic/atomic_base.h
+++ b/libcxx/include/__atomic/atomic_base.h
@@ -14,7 +14,6 @@
 #include <__atomic/cxx_atomic_impl.h>
 #include <__atomic/is_always_lock_free.h>
 #include <__atomic/memory_order.h>
-#include <__availability>
 #include <__config>
 #include <__memory/addressof.h>
 #include <__type_traits/is_integral.h>
diff --git a/libcxx/include/__atomic/atomic_flag.h b/libcxx/include/__atomic/atomic_flag.h
index 3ec3366ecaaf9..00b157cdff78b 100644
--- a/libcxx/include/__atomic/atomic_flag.h
+++ b/libcxx/include/__atomic/atomic_flag.h
@@ -13,7 +13,6 @@
 #include <__atomic/contention_t.h>
 #include <__atomic/cxx_atomic_impl.h>
 #include <__atomic/memory_order.h>
-#include <__availability>
 #include <__chrono/duration.h>
 #include <__config>
 #include <__memory/addressof.h>
diff --git a/libcxx/include/__atomic/atomic_sync.h b/libcxx/include/__atomic/atomic_sync.h
index 175700be54c01..1de5037329f81 100644
--- a/libcxx/include/__atomic/atomic_sync.h
+++ b/libcxx/include/__atomic/atomic_sync.h
@@ -13,7 +13,6 @@
 #include <__atomic/cxx_atomic_impl.h>
 #include <__atomic/memory_order.h>
 #include <__atomic/to_gcc_order.h>
-#include <__availability>
 #include <__chrono/duration.h>
 #include <__config>
 #include <__memory/addressof.h>
diff --git a/libcxx/include/__charconv/to_chars_floating_point.h b/libcxx/include/__charconv/to_chars_floating_point.h
index 08720e1078852..118f316b21a10 100644
--- a/libcxx/include/__charconv/to_chars_floating_point.h
+++ b/libcxx/include/__charconv/to_chars_floating_point.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___CHARCONV_TO_CHARS_FLOATING_POINT_H
 #define _LIBCPP___CHARCONV_TO_CHARS_FLOATING_POINT_H
 
-#include <__availability>
 #include <__charconv/chars_format.h>
 #include <__charconv/to_chars_result.h>
 #include <__config>
diff --git a/libcxx/include/__chrono/file_clock.h b/libcxx/include/__chrono/file_clock.h
index 7d25729fec013..4dd3f88ce5ba4 100644
--- a/libcxx/include/__chrono/file_clock.h
+++ b/libcxx/include/__chrono/file_clock.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___CHRONO_FILE_CLOCK_H
 #define _LIBCPP___CHRONO_FILE_CLOCK_H
 
-#include <__availability>
 #include <__chrono/duration.h>
 #include <__chrono/system_clock.h>
 #include <__chrono/time_point.h>
diff --git a/libcxx/include/__chrono/tzdb_list.h b/libcxx/include/__chrono/tzdb_list.h
index 62db7e3d2e0b5..aeef4fe1aba3c 100644
--- a/libcxx/include/__chrono/tzdb_list.h
+++ b/libcxx/include/__chrono/tzdb_list.h
@@ -16,7 +16,6 @@
 // Enable the contents of the header only when libc++ was built with experimental features enabled.
 #if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
 
-#  include <__availability>
 #  include <__chrono/time_zone.h>
 #  include <__chrono/tzdb.h>
 #  include <__config>
diff --git a/libcxx/include/__config b/libcxx/include/__config
index e048dad52c466..79422e8f6c5d1 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -12,6 +12,7 @@
 
 #include <__config_site>
 #include <__configuration/abi.h>
+#include <__configuration/availability.h>
 #include <__configuration/compiler.h>
 #include <__configuration/platform.h>
 
@@ -35,25 +36,6 @@
 #    define _LIBCPP_FREESTANDING
 #  endif
 
-// NOLINTBEGIN(libcpp-cpp-version-check)
-#  ifndef _LIBCPP_STD_VER
-#    if __cplusplus <= 201103L
-#      define _LIBCPP_STD_VER 11
-#    elif __cplusplus <= 201402L
-#      define _LIBCPP_STD_VER 14
-#    elif __cplusplus <= 201703L
-#      define _LIBCPP_STD_VER 17
-#    elif __cplusplus <= 202002L
-#      define _LIBCPP_STD_VER 20
-#    elif __cplusplus <= 202302L
-#      define _LIBCPP_STD_VER 23
-#    else
-// Expected release year of the next C++ standard
-#      define _LIBCPP_STD_VER 26
-#    endif
-#  endif // _LIBCPP_STD_VER
-// NOLINTEND(libcpp-cpp-version-check)
-
 // HARDENING {
 
 // TODO(hardening): deprecate this in LLVM 19.
@@ -364,10 +346,6 @@ typedef __char32_t char32_t;
 
 #  endif
 
-#  if !defined(__cpp_exceptions) || __cpp_exceptions < 199711L
-#    define _LIBCPP_HAS_NO_EXCEPTIONS
-#  endif
-
 #  define _LIBCPP_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp)
 
 #  if defined(_LIBCPP_COMPILER_CLANG_BASED)
@@ -840,11 +818,6 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_CONSTEXPR_SINCE_CXX23
 #  endif
 
-// Try to find out if RTTI is disabled.
-#  if !defined(__cpp_rtti) || __cpp_rtti < 199711L
-#    define _LIBCPP_HAS_NO_RTTI
-#  endif
-
 #  ifndef _LIBCPP_WEAK
 #    define _LIBCPP_WEAK __attribute__((__weak__))
 #  endif
diff --git a/libcxx/include/__availability b/libcxx/include/__configuration/availability.h
similarity index 98%
rename from libcxx/include/__availability
rename to libcxx/include/__configuration/availability.h
index e44ac1962df36..1115431115ec3 100644
--- a/libcxx/include/__availability
+++ b/libcxx/include/__configuration/availability.h
@@ -7,10 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___AVAILABILITY
-#define _LIBCPP___AVAILABILITY
+#ifndef _LIBCPP___CONFIGURATION_AVAILABILITY_H
+#define _LIBCPP___CONFIGURATION_AVAILABILITY_H
 
-#include <__config>
+#include <__configuration/compiler.h>
+#include <__configuration/language.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -374,4 +375,4 @@
 #  define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION
 #endif
 
-#endif // _LIBCPP___AVAILABILITY
+#endif // _LIBCPP___CONFIGURATION_AVAILABILITY_H
diff --git a/libcxx/include/__configuration/language.h b/libcxx/include/__configuration/language.h
new file mode 100644
index 0000000000000..fa62a7b6f5c2a
--- /dev/null
+++ b/libcxx/include/__configuration/language.h
@@ -0,0 +1,46 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___CONFIGURATION_LANGUAGE_H
+#define _LIBCPP___CONFIGURATION_LANGUAGE_H
+
+#include <__config_site>
+
+#ifndef _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER
+#  pragma GCC system_header
+#endif
+
+// NOLINTBEGIN(libcpp-cpp-version-check)
+#ifdef __cplusplus
+#  if __cplusplus <= 201103L
+#    define _LIBCPP_STD_VER 11
+#  elif __cplusplus <= 201402L
+#    define _LIBCPP_STD_VER 14
+#  elif __cplusplus <= 201703L
+#    define _LIBCPP_STD_VER 17
+#  elif __cplusplus <= 202002L
+#    define _LIBCPP_STD_VER 20
+#  elif __cplusplus <= 202302L
+#    define _LIBCPP_STD_VER 23
+#  else
+// Expected release year of the next C++ standard
+#    define _LIBCPP_STD_VER 26
+#  endif
+#endif // __cplusplus
+// NOLINTEND(libcpp-cpp-version-check)
+
+#if !defined(__cpp_rtti) || __cpp_rtti < 199711L
+#  define _LIBCPP_HAS_NO_RTTI
+#endif
+
+#if !defined(__cpp_exceptions) || __cpp_exceptions < 199711L
+#  define _LIBCPP_HAS_NO_EXCEPTIONS
+#endif
+
+#endif // _LIBCPP___CONFIGURATION_LANGUAGE_H
diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h
index 868fd7c015339..0a8337fa39de3 100644
--- a/libcxx/include/__exception/exception_ptr.h
+++ b/libcxx/include/__exception/exception_ptr.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___EXCEPTION_EXCEPTION_PTR_H
 #define _LIBCPP___EXCEPTION_EXCEPTION_PTR_H
 
-#include <__availability>
 #include <__config>
 #include <__exception/operations.h>
 #include <__memory/addressof.h>
diff --git a/libcxx/include/__expected/bad_expected_access.h b/libcxx/include/__expected/bad_expected_access.h
index ef29fa5088313..1b734389e8311 100644
--- a/libcxx/include/__expected/bad_expected_access.h
+++ b/libcxx/include/__expected/bad_expected_access.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H
 #define _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H
 
-#include <__availability>
 #include <__config>
 #include <__exception/exception.h>
 #include <__utility/move.h>
diff --git a/libcxx/include/__filesystem/directory_entry.h b/libcxx/include/__filesystem/directory_entry.h
index 016ad94a853dc..96d88dcd90b4c 100644
--- a/libcxx/include/__filesystem/directory_entry.h
+++ b/libcxx/include/__filesystem/directory_entry.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___FILESYSTEM_DIRECTORY_ENTRY_H
 #define _LIBCPP___FILESYSTEM_DIRECTORY_ENTRY_H
 
-#include <__availability>
 #include <__chrono/time_point.h>
 #include <__compare/ordering.h>
 #include <__config>
diff --git a/libcxx/include/__filesystem/directory_iterator.h b/libcxx/include/__filesystem/directory_iterator.h
index a5aa5ff5432da..e0246d8001e19 100644
--- a/libcxx/include/__filesystem/directory_iterator.h
+++ b/libcxx/include/__filesystem/directory_iterator.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FILESYSTEM_DIRECTORY_ITERATOR_H
 
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__filesystem/directory_entry.h>
 #include <__filesystem/directory_options.h>
diff --git a/libcxx/include/__filesystem/filesystem_error.h b/libcxx/include/__filesystem/filesystem_error.h
index bfdcc5eaee521..80a11e3b1932c 100644
--- a/libcxx/include/__filesystem/filesystem_error.h
+++ b/libcxx/include/__filesystem/filesystem_error.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___FILESYSTEM_FILESYSTEM_ERROR_H
 #define _LIBCPP___FILESYSTEM_FILESYSTEM_ERROR_H
 
-#include <__availability>
 #include <__config>
 #include <__filesystem/path.h>
 #include <__memory/shared_ptr.h>
diff --git a/libcxx/include/__filesystem/operations.h b/libcxx/include/__filesystem/operations.h
index 9bb83576f54bc..f588189ed1d9d 100644
--- a/libcxx/include/__filesystem/operations.h
+++ b/libcxx/include/__filesystem/operations.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___FILESYSTEM_OPERATIONS_H
 #define _LIBCPP___FILESYSTEM_OPERATIONS_H
 
-#include <__availability>
 #include <__chrono/time_point.h>
 #include <__config>
 #include <__filesystem/copy_options.h>
diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h
index 89d319b4b19b5..ff468d517722f 100644
--- a/libcxx/include/__filesystem/path.h
+++ b/libcxx/include/__filesystem/path.h
@@ -12,7 +12,6 @@
 
 #include <__algorithm/replace.h>
 #include <__algorithm/replace_copy.h>
-#include <__availability>
 #include <__config>
 #include <__functional/unary_function.h>
 #include <__fwd/functional.h>
diff --git a/libcxx/include/__filesystem/path_iterator.h b/libcxx/include/__filesystem/path_iterator.h
index d2d65cd122cab..f4d486d86cf38 100644
--- a/libcxx/include/__filesystem/path_iterator.h
+++ b/libcxx/include/__filesystem/path_iterator.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FILESYSTEM_PATH_ITERATOR_H
 
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__filesystem/path.h>
 #include <__iterator/iterator_traits.h>
diff --git a/libcxx/include/__filesystem/recursive_directory_iterator.h b/libcxx/include/__filesystem/recursive_directory_iterator.h
index a8af4f73b14a5..caa1396eb301f 100644
--- a/libcxx/include/__filesystem/recursive_directory_iterator.h
+++ b/libcxx/include/__filesystem/recursive_directory_iterator.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___FILESYSTEM_RECURSIVE_DIRECTORY_ITERATOR_H
 #define _LIBCPP___FILESYSTEM_RECURSIVE_DIRECTORY_ITERATOR_H
 
-#include <__availability>
 #include <__config>
 #include <__filesystem/directory_entry.h>
 #include <__filesystem/directory_options.h>
diff --git a/libcxx/include/__filesystem/u8path.h b/libcxx/include/__filesystem/u8path.h
index bde878054865e..dae5823128f02 100644
--- a/libcxx/include/__filesystem/u8path.h
+++ b/libcxx/include/__filesystem/u8path.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FILESYSTEM_U8PATH_H
 
 #include <__algorithm/unwrap_iter.h>
-#include <__availability>
 #include <__config>
 #include <__filesystem/path.h>
 #include <string>
diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h
index 36057706933d4..244e55be3403c 100644
--- a/libcxx/include/__functional/function.h
+++ b/libcxx/include/__functional/function.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FUNCTIONAL_FUNCTION_H
 
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__exception/exception.h>
 #include <__functional/binary_function.h>
diff --git a/libcxx/include/__fwd/memory_resource.h b/libcxx/include/__fwd/memory_resource.h
index 03b78ad2bd3c0..d68b2c2b63154 100644
--- a/libcxx/include/__fwd/memory_resource.h
+++ b/libcxx/include/__fwd/memory_resource.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___FWD_MEMORY_RESOURCE_H
 #define _LIBCPP___FWD_MEMORY_RESOURCE_H
 
-#include <__availability>
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__fwd/string.h b/libcxx/include/__fwd/string.h
index 320c4e4c81836..2418e1f9b23d0 100644
--- a/libcxx/include/__fwd/string.h
+++ b/libcxx/include/__fwd/string.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___FWD_STRING_H
 #define _LIBCPP___FWD_STRING_H
 
-#include <__availability>
 #include <__config>
 #include <__fwd/memory.h>
 #include <__fwd/memory_resource.h>
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index 992b1ba43f100..de5707c4a67b0 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -403,6 +403,9 @@ struct __shared_ptr_deleter_ctor_reqs {
                             __well_formed_deleter<_Dp, _Yp*>::value;
 };
 
+template <class _Dp, class _Tp>
+using __shared_ptr_nullptr_deleter_ctor_reqs = _And<is_move_constructible<_Dp>, __well_formed_deleter<_Dp, nullptr_t> >;
+
 #if defined(_LIBCPP_ABI_ENABLE_SHARED_PTR_TRIVIAL_ABI)
 #  define _LIBCPP_SHARED_PTR_TRIVIAL_ABI __attribute__((__trivial_abi__))
 #else
@@ -498,7 +501,7 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr {
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
   }
 
-  template <class _Dp>
+  template <class _Dp, __enable_if_t<__shared_ptr_nullptr_deleter_ctor_reqs<_Dp, _Tp>::value, int> = 0 >
   _LIBCPP_HIDE_FROM_ABI shared_ptr(nullptr_t __p, _Dp __d) : __ptr_(nullptr) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
     try {
@@ -518,7 +521,7 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI _LIBCPP_TEMPLATE_VIS shared_ptr {
 #endif // _LIBCPP_HAS_NO_EXCEPTIONS
   }
 
-  template <class _Dp, class _Alloc>
+  template <class _Dp, class _Alloc, __enable_if_t<__shared_ptr_nullptr_deleter_ctor_reqs<_Dp, _Tp>::value, int> = 0 >
   _LIBCPP_HIDE_FROM_ABI shared_ptr(nullptr_t __p, _Dp __d, _Alloc __a) : __ptr_(nullptr) {
 #ifndef _LIBCPP_HAS_NO_EXCEPTIONS
     try {
diff --git a/libcxx/include/__memory_resource/memory_resource.h b/libcxx/include/__memory_resource/memory_resource.h
index e605838bf5ea4..ea85e50cd568b 100644
--- a/libcxx/include/__memory_resource/memory_resource.h
+++ b/libcxx/include/__memory_resource/memory_resource.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___MEMORY_RESOURCE_MEMORY_RESOURCE_H
 #define _LIBCPP___MEMORY_RESOURCE_MEMORY_RESOURCE_H
 
-#include <__availability>
 #include <__config>
 #include <__fwd/memory_resource.h>
 #include <cstddef>
diff --git a/libcxx/include/__memory_resource/monotonic_buffer_resource.h b/libcxx/include/__memory_resource/monotonic_buffer_resource.h
index 0c83f1ebc8db4..f45b30fdb3861 100644
--- a/libcxx/include/__memory_resource/monotonic_buffer_resource.h
+++ b/libcxx/include/__memory_resource/monotonic_buffer_resource.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___MEMORY_RESOURCE_MONOTONIC_BUFFER_RESOURCE_H
 #define _LIBCPP___MEMORY_RESOURCE_MONOTONIC_BUFFER_RESOURCE_H
 
-#include <__availability>
 #include <__config>
 #include <__memory/addressof.h>
 #include <__memory_resource/memory_resource.h>
diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h
index 8fda201124387..a71096d3e4784 100644
--- a/libcxx/include/__memory_resource/polymorphic_allocator.h
+++ b/libcxx/include/__memory_resource/polymorphic_allocator.h
@@ -10,7 +10,6 @@
 #define _LIBCPP___MEMORY_RESOURCE_POLYMORPHIC_ALLOCATOR_H
 
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__fwd/pair.h>
 #include <__memory_resource/memory_resource.h>
diff --git a/libcxx/include/__memory_resource/synchronized_pool_resource.h b/libcxx/include/__memory_resource/synchronized_pool_resource.h
index b261fb0b194a8..50a673c2861d1 100644
--- a/libcxx/include/__memory_resource/synchronized_pool_resource.h
+++ b/libcxx/include/__memory_resource/synchronized_pool_resource.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___MEMORY_RESOURCE_SYNCHRONIZED_POOL_RESOURCE_H
 #define _LIBCPP___MEMORY_RESOURCE_SYNCHRONIZED_POOL_RESOURCE_H
 
-#include <__availability>
 #include <__config>
 #include <__memory_resource/memory_resource.h>
 #include <__memory_resource/pool_options.h>
diff --git a/libcxx/include/__memory_resource/unsynchronized_pool_resource.h b/libcxx/include/__memory_resource/unsynchronized_pool_resource.h
index 81d5f9ec4da87..783db84262af7 100644
--- a/libcxx/include/__memory_resource/unsynchronized_pool_resource.h
+++ b/libcxx/include/__memory_resource/unsynchronized_pool_resource.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___MEMORY_RESOURCE_UNSYNCHRONIZED_POOL_RESOURCE_H
 #define _LIBCPP___MEMORY_RESOURCE_UNSYNCHRONIZED_POOL_RESOURCE_H
 
-#include <__availability>
 #include <__config>
 #include <__memory_resource/memory_resource.h>
 #include <__memory_resource/pool_options.h>
diff --git a/libcxx/include/__ostream/print.h b/libcxx/include/__ostream/print.h
index 97680cdab6da3..8265ac00777e2 100644
--- a/libcxx/include/__ostream/print.h
+++ b/libcxx/include/__ostream/print.h
@@ -9,7 +9,6 @@
 #ifndef _LIBCPP___OSTREAM_PRINT_H
 #define _LIBCPP___OSTREAM_PRINT_H
 
-#include <__availability>
 #include <__config>
 #include <__fwd/ostream.h>
 #include <__iterator/ostreambuf_iterator.h>
diff --git a/libcxx/include/__stop_token/stop_callback.h b/libcxx/include/__stop_token/stop_callback.h
index 7b526820f98a3..760cf2bb55b0c 100644
--- a/libcxx/include/__stop_token/stop_callback.h
+++ b/libcxx/include/__stop_token/stop_callback.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H
 #define _LIBCPP___STOP_TOKEN_STOP_CALLBACK_H
 
-#include <__availability>
 #include <__concepts/constructible.h>
 #include <__concepts/destructible.h>
 #include <__concepts/invocable.h>
diff --git a/libcxx/include/__stop_token/stop_source.h b/libcxx/include/__stop_token/stop_source.h
index 1080069cf3b8b..70697462784ab 100644
--- a/libcxx/include/__stop_token/stop_source.h
+++ b/libcxx/include/__stop_token/stop_source.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___STOP_TOKEN_STOP_SOURCE_H
 #define _LIBCPP___STOP_TOKEN_STOP_SOURCE_H
 
-#include <__availability>
 #include <__config>
 #include <__stop_token/intrusive_shared_ptr.h>
 #include <__stop_token/stop_state.h>
diff --git a/libcxx/include/__stop_token/stop_state.h b/libcxx/include/__stop_token/stop_state.h
index df07573f87862..b0eed13a143cf 100644
--- a/libcxx/include/__stop_token/stop_state.h
+++ b/libcxx/include/__stop_token/stop_state.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___STOP_TOKEN_STOP_STATE_H
 
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__stop_token/atomic_unique_lock.h>
 #include <__stop_token/intrusive_list_view.h>
diff --git a/libcxx/include/__stop_token/stop_token.h b/libcxx/include/__stop_token/stop_token.h
index f2eadb990bdec..1bd75cbbf6f8d 100644
--- a/libcxx/include/__stop_token/stop_token.h
+++ b/libcxx/include/__stop_token/stop_token.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___STOP_TOKEN_STOP_TOKEN_H
 #define _LIBCPP___STOP_TOKEN_STOP_TOKEN_H
 
-#include <__availability>
 #include <__config>
 #include <__stop_token/intrusive_shared_ptr.h>
 #include <__stop_token/stop_state.h>
diff --git a/libcxx/include/__thread/jthread.h b/libcxx/include/__thread/jthread.h
index 253e3a935d9b7..b3d5c25fb71c7 100644
--- a/libcxx/include/__thread/jthread.h
+++ b/libcxx/include/__thread/jthread.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___THREAD_JTHREAD_H
 #define _LIBCPP___THREAD_JTHREAD_H
 
-#include <__availability>
 #include <__config>
 #include <__functional/invoke.h>
 #include <__stop_token/stop_source.h>
diff --git a/libcxx/include/__thread/poll_with_backoff.h b/libcxx/include/__thread/poll_with_backoff.h
index d8354e6ca2398..4f961fe3f7629 100644
--- a/libcxx/include/__thread/poll_with_backoff.h
+++ b/libcxx/include/__thread/poll_with_backoff.h
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___THREAD_POLL_WITH_BACKOFF_H
 #define _LIBCPP___THREAD_POLL_WITH_BACKOFF_H
 
-#include <__availability>
 #include <__chrono/duration.h>
 #include <__chrono/high_resolution_clock.h>
 #include <__config>
diff --git a/libcxx/include/__verbose_abort b/libcxx/include/__verbose_abort
index 259c70dda8fe8..1e2265a6bf755 100644
--- a/libcxx/include/__verbose_abort
+++ b/libcxx/include/__verbose_abort
@@ -10,7 +10,6 @@
 #ifndef _LIBCPP___VERBOSE_ABORT
 #define _LIBCPP___VERBOSE_ABORT
 
-#include <__availability>
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/barrier b/libcxx/include/barrier
index a6b4d2288309e..bce67bb5d3425 100644
--- a/libcxx/include/barrier
+++ b/libcxx/include/barrier
@@ -54,7 +54,6 @@ namespace std
 #include <__assert>
 #include <__atomic/atomic_base.h>
 #include <__atomic/memory_order.h>
-#include <__availability>
 #include <__memory/unique_ptr.h>
 #include <__thread/poll_with_backoff.h>
 #include <__thread/timed_backoff_policy.h>
diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable
index 4ded1140d46b1..5195cd6057dd3 100644
--- a/libcxx/include/condition_variable
+++ b/libcxx/include/condition_variable
@@ -118,7 +118,6 @@ public:
 
 */
 
-#include <__availability>
 #include <__chrono/duration.h>
 #include <__chrono/steady_clock.h>
 #include <__chrono/time_point.h>
diff --git a/libcxx/include/deque b/libcxx/include/deque
index 3c33e04e9f05f..555761aae6afd 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -189,7 +189,6 @@ template <class T, class Allocator, class Predicate>
 #include <__algorithm/remove_if.h>
 #include <__algorithm/unwrap_iter.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__debug_utils/sanitizers.h>
 #include <__format/enable_insertable.h>
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index 80dd49fe3d75a..363931e3f2388 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -199,7 +199,6 @@ template <class T, class Allocator, class Predicate>
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__algorithm/min.h>
-#include <__availability>
 #include <__config>
 #include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index 7128f72e16119..18f4dd3eed0b2 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -188,7 +188,6 @@ typedef basic_fstream<wchar_t> wfstream;
 
 #include <__algorithm/max.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__fwd/fstream.h>
 #include <__locale>
diff --git a/libcxx/include/latch b/libcxx/include/latch
index 1937617f7dcc6..da8dae149c79f 100644
--- a/libcxx/include/latch
+++ b/libcxx/include/latch
@@ -50,7 +50,6 @@ namespace std
 #include <__atomic/atomic_base.h>
 #include <__atomic/atomic_sync.h>
 #include <__atomic/memory_order.h>
-#include <__availability>
 #include <cstddef>
 #include <limits>
 #include <version>
diff --git a/libcxx/include/list b/libcxx/include/list
index 610a24e384600..87f15e144ac8f 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -203,7 +203,6 @@ template <class T, class Allocator, class Predicate>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__algorithm/min.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__format/enable_insertable.h>
 #include <__iterator/distance.h>
diff --git a/libcxx/include/map b/libcxx/include/map
index 1d1c062a0267c..7efa715e84aa7 100644
--- a/libcxx/include/map
+++ b/libcxx/include/map
@@ -575,7 +575,6 @@ erase_if(multimap<Key, T, Compare, Allocator>& c, Predicate pred);  // C++20
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__functional/binary_function.h>
 #include <__functional/is_transparent.h>
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 1f7c2a183f63d..48391b2a12095 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -570,10 +570,6 @@ module std_private_assert            [system] {
   header "__assert"
   export *
 }
-module std_private_availability      [system] {
-  header "__availability"
-  export *
-}
 module std_private_bit_reference     [system] {
   header "__bit_reference"
   export *
@@ -584,7 +580,9 @@ module std_private_fwd_bit_reference [system] {
 module std_private_config            [system] {
   textual header "__config"
   textual header "__configuration/abi.h"
+  textual header "__configuration/availability.h"
   textual header "__configuration/compiler.h"
+  textual header "__configuration/language.h"
   textual header "__configuration/platform.h"
   export *
 }
diff --git a/libcxx/include/optional b/libcxx/include/optional
index a16e48502e250..622e150f7a9f7 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -178,7 +178,6 @@ namespace std {
 */
 
 #include <__assert>
-#include <__availability>
 #include <__compare/compare_three_way_result.h>
 #include <__compare/three_way_comparable.h>
 #include <__concepts/invocable.h>
diff --git a/libcxx/include/print b/libcxx/include/print
index e0bcf214ea239..5bdaa559af724 100644
--- a/libcxx/include/print
+++ b/libcxx/include/print
@@ -34,7 +34,6 @@ namespace std {
 */
 
 #include <__assert>
-#include <__availability>
 #include <__concepts/same_as.h>
 #include <__config>
 #include <__system_error/system_error.h>
diff --git a/libcxx/include/regex b/libcxx/include/regex
index ce9f34260254a..b3869d36de1df 100644
--- a/libcxx/include/regex
+++ b/libcxx/include/regex
@@ -792,7 +792,6 @@ typedef regex_token_iterator<wstring::const_iterator> wsregex_token_iterator;
 #include <__algorithm/find.h>
 #include <__algorithm/search.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__iterator/back_insert_iterator.h>
 #include <__iterator/default_sentinel.h>
diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore
index cb2f42c106ca8..8d3b04475c092 100644
--- a/libcxx/include/semaphore
+++ b/libcxx/include/semaphore
@@ -55,7 +55,6 @@ using binary_semaphore = counting_semaphore<1>;
 #include <__atomic/atomic_base.h>
 #include <__atomic/atomic_sync.h>
 #include <__atomic/memory_order.h>
-#include <__availability>
 #include <__chrono/time_point.h>
 #include <__thread/poll_with_backoff.h>
 #include <__thread/support.h>
diff --git a/libcxx/include/set b/libcxx/include/set
index d9377ee6c3322..ab3a4363499af 100644
--- a/libcxx/include/set
+++ b/libcxx/include/set
@@ -516,7 +516,6 @@ erase_if(multiset<Key, Compare, Allocator>& c, Predicate pred);  // C++20
 #include <__algorithm/lexicographical_compare.h>
 #include <__algorithm/lexicographical_compare_three_way.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__functional/is_transparent.h>
 #include <__functional/operations.h>
diff --git a/libcxx/include/sstream b/libcxx/include/sstream
index 5009fe5c0057b..9ba43ffeb850f 100644
--- a/libcxx/include/sstream
+++ b/libcxx/include/sstream
@@ -312,7 +312,6 @@ typedef basic_stringstream<wchar_t> wstringstream;
 
 // clang-format on
 
-#include <__availability>
 #include <__config>
 #include <__fwd/sstream.h>
 #include <__ostream/basic_ostream.h>
diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map
index c838cd96b1123..2e25b0f050695 100644
--- a/libcxx/include/unordered_map
+++ b/libcxx/include/unordered_map
@@ -585,7 +585,6 @@ template <class Key, class T, class Hash, class Pred, class Alloc>
 
 #include <__algorithm/is_permutation.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__functional/is_transparent.h>
 #include <__functional/operations.h>
diff --git a/libcxx/include/unordered_set b/libcxx/include/unordered_set
index 5de1458beb1e6..c966cc8eb4df1 100644
--- a/libcxx/include/unordered_set
+++ b/libcxx/include/unordered_set
@@ -533,7 +533,6 @@ template <class Value, class Hash, class Pred, class Alloc>
 
 #include <__algorithm/is_permutation.h>
 #include <__assert>
-#include <__availability>
 #include <__config>
 #include <__functional/is_transparent.h>
 #include <__functional/operations.h>
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 631ffceab5f68..7ebd0534b1641 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -212,7 +212,6 @@ namespace std {
 
 */
 
-#include <__availability>
 #include <__compare/common_comparison_category.h>
 #include <__compare/compare_three_way_result.h>
 #include <__compare/three_way_comparable.h>
diff --git a/libcxx/include/vector b/libcxx/include/vector
index b190557fb7b7e..cbfc2cefa1fd9 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -316,7 +316,6 @@ template<class T, class charT> requires is-vector-bool-reference<T> // Since C++
 #include <__algorithm/rotate.h>
 #include <__algorithm/unwrap_iter.h>
 #include <__assert>
-#include <__availability>
 #include <__bit_reference>
 #include <__concepts/same_as.h>
 #include <__config>
diff --git a/libcxx/include/version b/libcxx/include/version
index 69556d731f1cf..d433e1b1c9cea 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -255,7 +255,6 @@ __cpp_lib_void_t                                        201411L <type_traits>
 
 */
 
-#include <__availability>
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -459,7 +458,6 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_constexpr_typeinfo                   202106L
 # define __cpp_lib_containers_ranges                    202202L
 # define __cpp_lib_expected                             202211L
-// # define __cpp_lib_format_path                          202403L
 # define __cpp_lib_format_ranges                        202207L
 // # define __cpp_lib_formatters                           202302L
 # define __cpp_lib_forward_like                         202207L
@@ -490,7 +488,6 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_stdatomic_h                          202011L
 # define __cpp_lib_string_contains                      202011L
 # define __cpp_lib_string_resize_and_overwrite          202110L
-// # define __cpp_lib_to_string                            202306L
 # define __cpp_lib_to_underlying                        202102L
 // # define __cpp_lib_tuple_like                           202207L
 # define __cpp_lib_unreachable                          202202L
@@ -506,6 +503,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 // # define __cpp_lib_copyable_function                    202306L
 // # define __cpp_lib_debugging                            202311L
 // # define __cpp_lib_default_template_type_for_algorithm_values 202403L
+// # define __cpp_lib_format_path                          202403L
 // # define __cpp_lib_freestanding_algorithm               202311L
 // # define __cpp_lib_freestanding_array                   202311L
 // # define __cpp_lib_freestanding_cstring                 202306L
@@ -537,6 +535,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 // # define __cpp_lib_text_encoding                        202306L
 # undef  __cpp_lib_to_chars
 // # define __cpp_lib_to_chars                             202306L
+// # define __cpp_lib_to_string                            202306L
 # undef  __cpp_lib_tuple_like
 // # define __cpp_lib_tuple_like                           202311L
 #endif
diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt
index d47d19a475531..82cd7b66beb7a 100644
--- a/libcxx/modules/CMakeLists.txt
+++ b/libcxx/modules/CMakeLists.txt
@@ -202,6 +202,7 @@ add_custom_target(generate-cxx-modules
   ALL DEPENDS
     ${_all_modules}
 )
+add_dependencies(cxx-test-depends generate-cxx-modules)
 
 # Configure the modules manifest.
 # Use the relative path between the installation and the module in the json
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 8b28d1b891895..65e6ce2c4da43 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -322,6 +322,7 @@ endif()
 
 # Add a meta-target for both libraries.
 add_custom_target(cxx DEPENDS ${LIBCXX_BUILD_TARGETS})
+add_dependencies(cxx-test-depends cxx)
 
 set(LIBCXX_EXPERIMENTAL_SOURCES
   experimental/keep.cpp
@@ -366,6 +367,7 @@ set_target_properties(cxx_experimental
 )
 cxx_add_common_build_flags(cxx_experimental)
 target_compile_options(cxx_experimental PUBLIC -D_LIBCPP_ENABLE_EXPERIMENTAL)
+add_dependencies(cxx-test-depends cxx_experimental)
 
 if (LIBCXX_INSTALL_SHARED_LIBRARY)
   install(TARGETS cxx_shared
diff --git a/libcxx/src/optional.cpp b/libcxx/src/optional.cpp
index 6ba63f2d89f5a..62b474a312be2 100644
--- a/libcxx/src/optional.cpp
+++ b/libcxx/src/optional.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <__availability>
 #include <optional>
 #include <stdexcept>
 
diff --git a/libcxx/src/ostream.cpp b/libcxx/src/ostream.cpp
index 443dce9a390be..e1a9a4bc1de71 100644
--- a/libcxx/src/ostream.cpp
+++ b/libcxx/src/ostream.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <__availability>
 #include <__config>
 #ifndef _LIBCPP_HAS_NO_FILESYSTEM
 #  include <fstream>
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index fd57aa9fe8b37..3c54a4edccff3 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -1,11 +1,5 @@
 include(HandleLitArguments)
 add_subdirectory(tools)
-# When the tools add clang-tidy support, the dependencies need to be updated.
-# This cannot be done in the tools CMakeLists.txt since that does not update
-# the status in this (a parent) directory.
-if(TARGET cxx-tidy)
-  list(APPEND LIBCXX_TEST_DEPS cxx-tidy)
-endif()
 
 # By default, libcxx and libcxxabi share a library directory.
 if (NOT LIBCXX_CXX_ABI_LIBRARY_PATH)
@@ -16,6 +10,8 @@ endif()
 set(AUTO_GEN_COMMENT "## Autogenerated by libcxx configuration.\n# Do not edit!")
 set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
 
+serialize_lit_string_param(SERIALIZED_LIT_PARAMS compiler "${CMAKE_CXX_COMPILER}")
+
 if (NOT LIBCXX_ENABLE_EXCEPTIONS)
   serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False)
 endif()
@@ -38,10 +34,6 @@ endif()
 
 serialize_lit_params_list(SERIALIZED_LIT_PARAMS LIBCXX_TEST_PARAMS)
 
-if (NOT DEFINED LIBCXX_TEST_DEPS)
-  message(FATAL_ERROR "Expected LIBCXX_TEST_DEPS to be defined")
-endif()
-
 if (MSVC)
   # Shared code for initializing some parameters used by all
   # llvm-libc++-*-clangcl.cfg.in test configs.
@@ -79,10 +71,6 @@ if (LIBCXX_INCLUDE_TESTS)
     ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
     MAIN_CONFIG "${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py")
 
-  add_custom_target(cxx-test-depends
-    DEPENDS cxx ${LIBCXX_TEST_DEPS}
-    COMMENT "Builds dependencies required to run the test suite.")
-
   add_lit_testsuite(check-cxx
     "Running libcxx tests"
     ${CMAKE_CURRENT_BINARY_DIR}
diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in
index 84b3270a8940a..78d0cb5a25748 100644
--- a/libcxx/test/configs/cmake-bridge.cfg.in
+++ b/libcxx/test/configs/cmake-bridge.cfg.in
@@ -23,8 +23,6 @@ config.recursiveExpansionLimit = 10
 config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test')
 
 # Add substitutions for bootstrapping the test suite configuration
-import shlex
-config.substitutions.append(('%{cxx}', shlex.quote('@CMAKE_CXX_COMPILER@')))
 config.substitutions.append(('%{libcxx-dir}', '@LIBCXX_SOURCE_DIR@'))
 config.substitutions.append(('%{include-dir}', '@LIBCXX_GENERATED_INCLUDE_DIR@'))
 config.substitutions.append(('%{target-include-dir}', '@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@'))
diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
index 3ee213358f352..08c682964c374 100644
--- a/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
+++ b/libcxx/test/libcxx/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
@@ -73,7 +73,7 @@ L link link_to_link
   TEST_VALIDATE_EXCEPTION(
       std::runtime_error,
       [&]([[maybe_unused]] const std::runtime_error& e) {
-        std::string_view what{"tzdb: requested time zone not found"};
+        [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"};
         TEST_LIBCPP_REQUIRE(
             e.what() == what,
             TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception   ", e.what(), '\n'));
diff --git a/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp
index c55a0a4d6e5d1..60723bf7b6e97 100644
--- a/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp
+++ b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp
@@ -15,7 +15,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS: -pedantic-errors
 
-#include <__availability>
+#include <__config>
 
 #if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS)
 #  error Availability annotations should be enabled on Apple platforms in the system configuration!
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp
index 761691c2afdcb..890ac23fff832 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains_subrange.pass.cpp
@@ -24,6 +24,7 @@
 //                                              Proj1 proj1 = {}, Proj2 proj2 = {});                 // since C++23
 
 #include <algorithm>
+#include <array>
 #include <cassert>
 #include <concepts>
 #include <ranges>
@@ -130,10 +131,10 @@ constexpr void test_iterators() {
   }
 
   { // range has zero length
-    int a[]       = {};
-    int p[]       = {3, 4, 2};
-    auto whole    = std::ranges::subrange(Iter1(a), Sent1(Iter1(a)));
-    auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(std::end(p))));
+    std::array<int, 0> a = {};
+    int p[]              = {3, 4, 2};
+    auto whole           = std::ranges::subrange(Iter1(a.data()), Sent1(Iter1(a.data())));
+    auto subrange        = std::ranges::subrange(Iter2(p), Sent2(Iter2(std::end(p))));
     {
       bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end());
       assert(!ret);
@@ -145,10 +146,10 @@ constexpr void test_iterators() {
   }
 
   { // subrange has zero length
-    int a[]       = {3, 4, 2};
-    int p[]       = {};
-    auto whole    = std::ranges::subrange(Iter1(a), Sent1(Iter1(std::end(a))));
-    auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(p)));
+    int a[]              = {3, 4, 2};
+    std::array<int, 0> p = {};
+    auto whole           = std::ranges::subrange(Iter1(a), Sent1(Iter1(std::end(a))));
+    auto subrange        = std::ranges::subrange(Iter2(p.data()), Sent2(Iter2(p.data())));
     {
       bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end());
       assert(ret);
@@ -160,10 +161,10 @@ constexpr void test_iterators() {
   }
 
   { // range and subrange both have zero length
-    int a[]       = {};
-    int p[]       = {};
-    auto whole    = std::ranges::subrange(Iter1(a), Sent1(Iter1(a)));
-    auto subrange = std::ranges::subrange(Iter2(p), Sent2(Iter2(p)));
+    std::array<int, 0> a = {};
+    std::array<int, 0> p = {};
+    auto whole           = std::ranges::subrange(Iter1(a.data()), Sent1(Iter1(a.data())));
+    auto subrange        = std::ranges::subrange(Iter2(p.data()), Sent2(Iter2(p.data())));
     {
       bool ret = std::ranges::contains_subrange(whole.begin(), whole.end(), subrange.begin(), subrange.end());
       assert(ret);
diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp
index 72b2f444c476c..90aa5ea5b6df4 100644
--- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp
@@ -9,6 +9,9 @@
 // XFAIL: !has-64-bit-atomics
 // XFAIL: !has-1024-bit-atomics
 
+// MSVC warning C4310: cast truncates constant value
+// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310
+
 // bool compare_exchange_strong(T&, T, memory_order, memory_order) const noexcept;
 // bool compare_exchange_strong(T&, T, memory_order = memory_order::seq_cst) const noexcept;
 
diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
index 5219a8e3714f9..99c1385a2fe0b 100644
--- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
@@ -9,6 +9,9 @@
 // XFAIL: !has-64-bit-atomics
 // XFAIL: !has-1024-bit-atomics
 
+// MSVC warning C4310: cast truncates constant value
+// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310
+
 // bool compare_exchange_weak(T&, T, memory_order, memory_order) const noexcept;
 // bool compare_exchange_weak(T&, T, memory_order = memory_order::seq_cst) const noexcept;
 
diff --git a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp
index e5310febf5c5e..f246803ba2592 100644
--- a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp
@@ -11,6 +11,9 @@
 // XFAIL: !has-64-bit-atomics
 // XFAIL: !has-1024-bit-atomics
 
+// MSVC warning C4310: cast truncates constant value
+// ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4310
+
 // void wait(T, memory_order = memory_order::seq_cst) const noexcept;
 
 #include <atomic>
diff --git a/libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp b/libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp
new file mode 100644
index 0000000000000..8eed20990cc00
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/array/lwg3382.compile.pass.cpp
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <array>
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <array>
+
+template <auto>
+struct Test {};
+
+void test() {
+  // LWG 3382. NTTP for pair and array
+  // https://cplusplus.github.io/LWG/issue3382
+  constexpr std::array<int, 5> a{};
+  [[maybe_unused]] Test<a> test1{};
+
+  constexpr std::array<int, 0> b{};
+  [[maybe_unused]] Test<b> test2{};
+}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/destory_elements.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp
similarity index 100%
rename from libcxx/test/std/containers/sequences/vector/vector.modifiers/destory_elements.pass.cpp
rename to libcxx/test/std/containers/sequences/vector/vector.modifiers/destroy_elements.pass.cpp
diff --git a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp
index 74a5094f61261..bc76e23fea3c0 100644
--- a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp
+++ b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp
@@ -93,9 +93,9 @@ constexpr bool test() {
 
 // Test P2447R4 "Annex C examples"
 
-constexpr int three(std::span<void* const> sp) { return sp.size(); }
+constexpr int three(std::span<void* const> sp) { return static_cast<int>(sp.size()); }
 
-constexpr int four(std::span<const std::any> sp) { return sp.size(); }
+constexpr int four(std::span<const std::any> sp) { return static_cast<int>(sp.size()); }
 
 bool test_P2447R4_annex_c_examples() {
   // 1. Overload resolution is affected
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
index d4bbde75ae882..7283fdc769d86 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
@@ -50,13 +50,16 @@ int main(int, char**)
         // responds with an empty message, which we probably want to
         // treat as a failure code otherwise, but we can detect that
         // with the preprocessor.
+#if defined(_NEWLIB_VERSION)
+        const bool is_newlib = true;
+#else
+        const bool is_newlib = false;
+#endif
+        (void)is_newlib;
         LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0       // AIX
                       || msg.rfind("No error information", 0) == 0 // Musl
                       || msg.rfind("Unknown error", 0) == 0        // Glibc
-#if defined(_NEWLIB_VERSION)
-                      || msg.empty()
-#endif
-        );
+                      || (is_newlib && msg.empty()));
         assert(errno == E2BIG);
     }
 
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
index eefbddd27a7f5..02a1baf599983 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
@@ -56,13 +56,16 @@ int main(int, char**) {
     // responds with an empty message, which we probably want to
     // treat as a failure code otherwise, but we can detect that
     // with the preprocessor.
+#if defined(_NEWLIB_VERSION)
+    const bool is_newlib = true;
+#else
+    const bool is_newlib = false;
+#endif
+    (void)is_newlib;
     LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0       // AIX
                   || msg.rfind("No error information", 0) == 0 // Musl
                   || msg.rfind("Unknown error", 0) == 0        // Glibc
-#if defined(_NEWLIB_VERSION)
-                  || msg.empty()
-#endif
-    );
+                  || (is_newlib && msg.empty()));
     assert(errno == E2BIG);
   }
 
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp
index 5edf22eaacf31..d6bb56d9b78b7 100644
--- a/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp
+++ b/libcxx/test/std/input.output/file.streams/fstreams/fstream.cons/path.pass.cpp
@@ -37,7 +37,7 @@ constexpr bool test_non_convert_to_path() {
   static_assert(!std::is_constructible_v<std::fstream, const std::basic_string_view<CharT>>);
 
   // Char* pointers
-  if constexpr (!std::is_same_v<CharT, char>)
+  if constexpr (!std::is_same_v<CharT, char> && !std::is_same_v<CharT, fs::path::value_type>)
     static_assert(!std::is_constructible_v<std::fstream, const CharT*>);
 
   // Iterators
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp
index 2f27fd8e6e93d..792b65615679a 100644
--- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp
+++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp
@@ -38,7 +38,7 @@ constexpr bool test_non_convert_to_path() {
   static_assert(!std::is_constructible_v<std::ifstream, const std::basic_string_view<CharT>>);
 
   // Char* pointers
-  if constexpr (!std::is_same_v<CharT, char>)
+  if constexpr (!std::is_same_v<CharT, char> && !std::is_same_v<CharT, fs::path::value_type>)
     static_assert(!std::is_constructible_v<std::ifstream, const CharT*>);
 
   // Iterators
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp
index e55adfd83fc3c..602bdadd85813 100644
--- a/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp
+++ b/libcxx/test/std/input.output/file.streams/fstreams/ofstream.cons/path.pass.cpp
@@ -37,7 +37,7 @@ constexpr bool test_non_convert_to_path() {
   static_assert(!std::is_constructible_v<std::ofstream, const std::basic_string_view<CharT>>);
 
   // Char* pointers
-  if constexpr (!std::is_same_v<CharT, char>)
+  if constexpr (!std::is_same_v<CharT, char> && !std::is_same_v<CharT, fs::path::value_type>)
     static_assert(!std::is_constructible_v<std::ofstream, const CharT*>);
 
   // Iterators
diff --git a/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp b/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp
index cb49086dd6802..998b13ed49455 100644
--- a/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/move.iterators/sized_sentinel.compile.pass.cpp
@@ -21,6 +21,7 @@ struct unsized_it {
   using difference_type = std::ptrdiff_t;
 
   value_type& operator*() const;
+  unsized_it& operator++();
   bool operator==(const unsized_it&) const;
   difference_type operator-(const unsized_it&) const { return 0; }
 };
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp
index 308cc2d43b058..4aba33482f69c 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp
@@ -20,7 +20,7 @@
 /*  Constant                 Value
     __cpp_lib_char8_t        201907L [C++20]
     __cpp_lib_filesystem     201703L [C++17]
-    __cpp_lib_format_path    202403L [C++23]
+    __cpp_lib_format_path    202403L [C++26]
 */
 
 #include <filesystem>
@@ -37,7 +37,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 14
@@ -51,7 +51,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 17
@@ -74,7 +74,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 20
@@ -106,7 +106,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 23
@@ -137,17 +137,8 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_format_path
-#     error "__cpp_lib_format_path should be defined in c++23"
-#   endif
-#   if __cpp_lib_format_path != 202403L
-#     error "__cpp_lib_format_path should have the value 202403L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_format_path
-#     error "__cpp_lib_format_path should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifdef __cpp_lib_format_path
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
index 16a9a0a28de63..af6386a40a458 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp
@@ -29,7 +29,7 @@
     __cpp_lib_string_udls                                   201304L [C++14]
     __cpp_lib_string_view                                   201606L [C++17]
                                                             201803L [C++20]
-    __cpp_lib_to_string                                     202306L [C++23]
+    __cpp_lib_to_string                                     202306L [C++26]
 */
 
 #include <string>
@@ -86,7 +86,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 14
@@ -143,7 +143,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 17
@@ -209,7 +209,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 20
@@ -293,7 +293,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER == 23
@@ -385,17 +385,8 @@
 #   error "__cpp_lib_string_view should have the value 201803L in c++23"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_to_string
-#     error "__cpp_lib_to_string should be defined in c++23"
-#   endif
-#   if __cpp_lib_to_string != 202306L
-#     error "__cpp_lib_to_string should have the value 202306L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_to_string
-#     error "__cpp_lib_to_string should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifdef __cpp_lib_to_string
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 #elif TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index 7829e06f90760..c1e1f9f340af4 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -88,7 +88,7 @@
     __cpp_lib_expected                                      202211L [C++23]
     __cpp_lib_filesystem                                    201703L [C++17]
     __cpp_lib_format                                        202106L [C++20]
-    __cpp_lib_format_path                                   202403L [C++23]
+    __cpp_lib_format_path                                   202403L [C++26]
     __cpp_lib_format_ranges                                 202207L [C++23]
     __cpp_lib_format_uchar                                  202311L [C++20]
     __cpp_lib_formatters                                    202302L [C++23]
@@ -216,7 +216,7 @@
     __cpp_lib_to_array                                      201907L [C++20]
     __cpp_lib_to_chars                                      201611L [C++17]
                                                             202306L [C++26]
-    __cpp_lib_to_string                                     202306L [C++23]
+    __cpp_lib_to_string                                     202306L [C++26]
     __cpp_lib_to_underlying                                 202102L [C++23]
     __cpp_lib_transformation_trait_aliases                  201304L [C++14]
     __cpp_lib_transparent_operators                         201210L [C++14]
@@ -513,7 +513,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_format_ranges
@@ -1005,7 +1005,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_to_underlying
@@ -1348,7 +1348,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_format_ranges
@@ -1891,7 +1891,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_to_underlying
@@ -2303,7 +2303,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_format_ranges
@@ -2972,7 +2972,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_to_underlying
@@ -3543,7 +3543,7 @@
 # endif
 
 # ifdef __cpp_lib_format_path
-#   error "__cpp_lib_format_path should not be defined before c++23"
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_format_ranges
@@ -4350,7 +4350,7 @@
 # endif
 
 # ifdef __cpp_lib_to_string
-#   error "__cpp_lib_to_string should not be defined before c++23"
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 # ifdef __cpp_lib_to_underlying
@@ -4971,17 +4971,8 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_format_path
-#     error "__cpp_lib_format_path should be defined in c++23"
-#   endif
-#   if __cpp_lib_format_path != 202403L
-#     error "__cpp_lib_format_path should have the value 202403L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_format_path
-#     error "__cpp_lib_format_path should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifdef __cpp_lib_format_path
+#   error "__cpp_lib_format_path should not be defined before c++26"
 # endif
 
 # ifndef __cpp_lib_format_ranges
@@ -5943,17 +5934,8 @@
 #   endif
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_to_string
-#     error "__cpp_lib_to_string should be defined in c++23"
-#   endif
-#   if __cpp_lib_to_string != 202306L
-#     error "__cpp_lib_to_string should have the value 202306L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_to_string
-#     error "__cpp_lib_to_string should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifdef __cpp_lib_to_string
+#   error "__cpp_lib_to_string should not be defined before c++26"
 # endif
 
 # ifndef __cpp_lib_to_underlying
diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
index 212804356a056..6a9ec1a2ffec2 100644
--- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
@@ -17,6 +17,7 @@
 #include <cassert>
 #include <climits>
 #include <cstdint>
+#include <limits>
 #include <random>
 #include <type_traits>
 
@@ -57,22 +58,26 @@ T basic_gcd_(T m, T n) {
 template <typename T>
 T basic_gcd(T m, T n) {
   using Tp = std::make_unsigned_t<T>;
-  if (m < 0 && m != std::numeric_limits<T>::min())
-    m = -m;
-  if (n < 0 && n != std::numeric_limits<T>::min())
-    n = -n;
+  if constexpr (std::is_signed_v<T>) {
+    if (m < 0 && m != std::numeric_limits<T>::min())
+      m = -m;
+    if (n < 0 && n != std::numeric_limits<T>::min())
+      n = -n;
+  }
   return basic_gcd_(static_cast<Tp>(m), static_cast<Tp>(n));
 }
 
 template <typename Input>
 void do_fuzzy_tests() {
   std::mt19937 gen(1938);
-  std::uniform_int_distribution<Input> distrib;
+  using DistIntType         = std::conditional_t<sizeof(Input) == 1, int, Input>; // See N4981 [rand.req.genl]/1.5
+  constexpr Input max_input = std::numeric_limits<Input>::max();
+  std::uniform_int_distribution<DistIntType> distrib(0, max_input);
 
   constexpr int nb_rounds = 10000;
   for (int i = 0; i < nb_rounds; ++i) {
-    Input n = distrib(gen);
-    Input m = distrib(gen);
+    Input n = static_cast<Input>(distrib(gen));
+    Input m = static_cast<Input>(distrib(gen));
     assert(std::gcd(n, m) == basic_gcd(n, m));
   }
 }
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp
index 2c43e121613c7..f31a679dd6214 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/current_zone.pass.cpp
@@ -32,7 +32,7 @@ static void set_tz(std::string zone) {
   // Unlike POSIX it does not mention the string of putenv becomes part
   // of the environment.
 
-  int status = _putenv_s("TZ", zone.c_str(), 1);
+  int status = _putenv_s("TZ", zone.c_str());
   assert(status == 0);
 }
 
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp
index 4d600fcdf40e3..8dd895fd21814 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/locate_zone.pass.cpp
@@ -40,7 +40,7 @@ static void test_exception([[maybe_unused]] std::string_view zone) {
   TEST_VALIDATE_EXCEPTION(
       std::runtime_error,
       [&]([[maybe_unused]] const std::runtime_error& e) {
-        std::string_view what{"tzdb: requested time zone not found"};
+        [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"};
         TEST_LIBCPP_REQUIRE(
             e.what() == what,
             TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception   ", e.what(), '\n'));
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp
index e6497e26323ce..98509c298ebcb 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/current_zone.pass.cpp
@@ -34,7 +34,7 @@ static void set_tz(std::string zone) {
   // Unlike POSIX it does not mention the string of putenv becomes part
   // of the environment.
 
-  int status = _putenv_s("TZ", zone.c_str(), 1);
+  int status = _putenv_s("TZ", zone.c_str());
   assert(status == 0);
 }
 
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
index f929dafcc9683..08ce48dfd0edb 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.tzdb/locate_zone.pass.cpp
@@ -42,7 +42,7 @@ static void test_exception([[maybe_unused]] std::string_view zone) {
   TEST_VALIDATE_EXCEPTION(
       std::runtime_error,
       [&]([[maybe_unused]] const std::runtime_error& e) {
-        std::string_view what{"tzdb: requested time zone not found"};
+        [[maybe_unused]] std::string_view what{"tzdb: requested time zone not found"};
         TEST_LIBCPP_REQUIRE(
             e.what() == what,
             TEST_WRITE_CONCATENATED("\nExpected exception ", what, "\nActual exception   ", e.what(), '\n'));
diff --git a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp
index ef43ab9b64b5b..381bcda761700 100644
--- a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.pass.cpp
@@ -118,10 +118,14 @@ int main(int, char**) {
 // Make sure we fail in a SFINAE-friendly manner when we try to deduce
 // from a type without a valid call operator.
 template <typename F, typename = decltype(std::function{std::declval<F>()})>
-constexpr bool can_deduce() { return true; }
+constexpr bool can_deduce_test(int) { return true; }
 template <typename F>
-constexpr bool can_deduce(...) { return false; }
+constexpr bool can_deduce_test(...) { return false; }
 
+template <typename F>
+constexpr bool can_deduce = can_deduce_test<F>(0);
+
+struct valid { int operator()() const; };
 struct invalid1 { };
 struct invalid2 {
   template <typename ...Args>
@@ -131,6 +135,22 @@ struct invalid3 {
   void operator()(int);
   void operator()(long);
 };
-static_assert(!can_deduce<invalid1>());
-static_assert(!can_deduce<invalid2>());
-static_assert(!can_deduce<invalid3>());
+static_assert( can_deduce<valid>);
+static_assert(!can_deduce<invalid1>);
+static_assert(!can_deduce<invalid2>);
+static_assert(!can_deduce<invalid3>);
+
+
+// LWG 3238. Insufficiently-defined behavior of std::function deduction guides
+// https://cplusplus.github.io/LWG/issue3238
+// The deduction guides for std::function do not handle rvalue-ref qualified
+// call operators and C-style variadics. They also don't deduce from nullptr_t.
+// Make sure we stick to the specification.
+
+struct invalid_rvalue_ref { R operator()() && { return {}; } };
+struct invalid_c_vararg { R operator()(int, ...) { return {}; } };
+
+static_assert(!can_deduce<invalid_rvalue_ref>);
+static_assert(!can_deduce<invalid_c_vararg>);
+static_assert(!can_deduce<std::nullptr_t>);
+
diff --git a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp b/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp
deleted file mode 100644
index 8a42d3be3571c..0000000000000
--- a/libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/deduct_F.verify.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <functional>
-
-// template<class F>
-// function(F) -> function<see-below>;
-
-// UNSUPPORTED: c++03, c++11, c++14
-
-// The deduction guides for std::function do not handle rvalue-ref qualified
-// call operators and C-style variadics. It also doesn't deduce from nullptr_t.
-// Make sure we stick to the specification.
-
-#include <functional>
-
-struct R { };
-struct f0 { R operator()() && { return {}; } };
-struct f1 { R operator()(int, ...) { return {}; } };
-
-void f() {
-    std::function f = f0{}; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'function'}}
-    std::function g = f1{}; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'function'}}
-    std::function h = nullptr; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'function'}}
-}
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp
index 49497b6956b9f..13340ed5294c0 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter.pass.cpp
@@ -17,6 +17,7 @@
 #include "test_macros.h"
 #include "deleter_types.h"
 
+#include "types.h"
 struct A
 {
     static int count;
@@ -28,6 +29,25 @@ struct A
 
 int A::count = 0;
 
+// LWG 3233. Broken requirements for shared_ptr converting constructors
+// https://cplusplus.github.io/LWG/issue3233
+static_assert( std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, test_deleter<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, bad_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, no_nullptr_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, no_move_deleter>::value, "");
+
+#if TEST_STD_VER >= 17
+static_assert( std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, test_deleter<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, bad_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, no_nullptr_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, no_move_deleter>::value, "");
+
+static_assert( std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, test_deleter<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, bad_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, no_nullptr_deleter>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, no_move_deleter>::value, "");
+#endif
+
 int main(int, char**)
 {
     {
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp
index 4e9fc227b99e8..53ca6fb5b234d 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_allocator.pass.cpp
@@ -17,6 +17,8 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
+#include "types.h"
+
 struct A
 {
     static int count;
@@ -28,6 +30,25 @@ struct A
 
 int A::count = 0;
 
+// LWG 3233. Broken requirements for shared_ptr converting constructors
+// https://cplusplus.github.io/LWG/issue3233
+static_assert( std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, test_deleter<int>, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, bad_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, no_nullptr_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int>,  std::nullptr_t, no_move_deleter, test_allocator<int> >::value, "");
+
+#if TEST_STD_VER >= 17
+static_assert( std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, test_deleter<int>, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, bad_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, no_nullptr_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  std::nullptr_t, no_move_deleter, test_allocator<int> >::value, "");
+
+static_assert( std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, test_deleter<int>, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, bad_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, no_nullptr_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>,  std::nullptr_t, no_move_deleter, test_allocator<int> >::value, "");
+#endif
+
 int main(int, char**)
 {
     test_allocator_statistics alloc_stats;
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp
index 42225a4b0be7e..9c1e9b72be573 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter.pass.cpp
@@ -17,6 +17,8 @@
 #include "test_macros.h"
 #include "deleter_types.h"
 
+#include "types.h"
+
 struct A
 {
     static int count;
@@ -28,38 +30,8 @@ struct A
 
 int A::count = 0;
 
-struct bad_ty { };
-
-struct bad_deleter
-{
-    void operator()(bad_ty) { }
-};
-
-struct no_move_deleter
-{
-    no_move_deleter(no_move_deleter const&) = delete;
-    no_move_deleter(no_move_deleter &&) = delete;
-    void operator()(int*) { }
-};
-
-static_assert(!std::is_move_constructible<no_move_deleter>::value, "");
-
-struct Base { };
-struct Derived : Base { };
-
-template<class T>
-class MoveDeleter
-{
-    MoveDeleter();
-    MoveDeleter(MoveDeleter const&);
-public:
-  MoveDeleter(MoveDeleter&&) {}
-
-  explicit MoveDeleter(int) {}
-
-  void operator()(T* ptr) { delete ptr; }
-};
-
+// LWG 3233. Broken requirements for shared_ptr converting constructors
+// https://cplusplus.github.io/LWG/issue3233
 // https://llvm.org/PR60258
 // Invalid constructor SFINAE for std::shared_ptr's array ctors
 static_assert( std::is_constructible<std::shared_ptr<int>,  int*, test_deleter<int> >::value, "");
@@ -68,12 +40,12 @@ static_assert( std::is_constructible<std::shared_ptr<Base>,  Derived*, test_dele
 static_assert(!std::is_constructible<std::shared_ptr<A>,  int*, test_deleter<A> >::value, "");
 
 #if TEST_STD_VER >= 17
-static_assert( std::is_constructible<std::shared_ptr<int[]>,  int*, test_deleter<int>>::value, "");
+static_assert( std::is_constructible<std::shared_ptr<int[]>,  int*, test_deleter<int> >::value, "");
 static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int*, bad_deleter>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int(*)[], test_deleter<int>>::value, "");
-static_assert( std::is_constructible<std::shared_ptr<int[5]>, int*, test_deleter<int>>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int(*)[], test_deleter<int> >::value, "");
+static_assert( std::is_constructible<std::shared_ptr<int[5]>, int*, test_deleter<int> >::value, "");
 static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int*, bad_deleter>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int(*)[5], test_deleter<int>>::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int(*)[5], test_deleter<int> >::value, "");
 #endif
 
 int main(int, char**)
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp
index a110525b9b922..9dffbcdd59a73 100644
--- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_allocator.pass.cpp
@@ -17,6 +17,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
+#include "types.h"
 struct A
 {
     static int count;
@@ -28,38 +29,8 @@ struct A
 
 int A::count = 0;
 
-struct bad_ty { };
-
-struct bad_deleter
-{
-    void operator()(bad_ty) { }
-};
-
-struct no_move_deleter
-{
-    no_move_deleter(no_move_deleter const&) = delete;
-    no_move_deleter(no_move_deleter &&) = delete;
-    void operator()(int*) { }
-};
-
-static_assert(!std::is_move_constructible<no_move_deleter>::value, "");
-
-struct Base { };
-struct Derived : Base { };
-
-template<class T>
-class MoveDeleter
-{
-    MoveDeleter();
-    MoveDeleter(MoveDeleter const&);
-public:
-  MoveDeleter(MoveDeleter&&) {}
-
-  explicit MoveDeleter(int) {}
-
-  void operator()(T* ptr) { delete ptr; }
-};
-
+// LWG 3233. Broken requirements for shared_ptr converting constructors
+// https://cplusplus.github.io/LWG/issue3233
 // https://llvm.org/PR60258
 // Invalid constructor SFINAE for std::shared_ptr's array ctors
 static_assert( std::is_constructible<std::shared_ptr<int>,  int*, test_deleter<int>, test_allocator<int> >::value, "");
@@ -68,12 +39,12 @@ static_assert( std::is_constructible<std::shared_ptr<Base>,  Derived*, test_dele
 static_assert(!std::is_constructible<std::shared_ptr<A>,  int*, test_deleter<A>, test_allocator<A> >::value, "");
 
 #if TEST_STD_VER >= 17
-static_assert( std::is_constructible<std::shared_ptr<int[]>,  int*, test_deleter<int>, test_allocator<int>>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int*, bad_deleter, test_allocator<int>>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int(*)[], test_deleter<int>, test_allocator<int>>::value, "");
-static_assert( std::is_constructible<std::shared_ptr<int[5]>, int*, test_deleter<int>, test_allocator<int>>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int*, bad_deleter, test_allocator<int>>::value, "");
-static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int(*)[5], test_deleter<int>, test_allocator<int>>::value, "");
+static_assert( std::is_constructible<std::shared_ptr<int[]>,  int*, test_deleter<int>, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int*, bad_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[]>,  int(*)[], test_deleter<int>, test_allocator<int> >::value, "");
+static_assert( std::is_constructible<std::shared_ptr<int[5]>, int*, test_deleter<int>, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int*, bad_deleter, test_allocator<int> >::value, "");
+static_assert(!std::is_constructible<std::shared_ptr<int[5]>, int(*)[5], test_deleter<int>, test_allocator<int> >::value, "");
 #endif
 
 
diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h
new file mode 100644
index 0000000000000..5bfb3d70febea
--- /dev/null
+++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/types.h
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_UTILITIES_MEMORY_UTIL_SMARTPTR_SHARED_CONST_TYPES_H
+#define TEST_STD_UTILITIES_MEMORY_UTIL_SMARTPTR_SHARED_CONST_TYPES_H
+
+#include <type_traits>
+
+struct bad_ty {}; // Empty tag type; it is the parameter type of bad_deleter::operator().
+
+struct bad_deleter { // Deleter whose call operator takes a bad_ty by value, not a pointer.
+  void operator()(bad_ty) {}
+};
+
+struct no_move_deleter { // Deleter whose copy and move constructors are both deleted.
+  no_move_deleter(no_move_deleter const&) = delete;
+  no_move_deleter(no_move_deleter&&)      = delete;
+  void operator()(int*) {} // No-op call operator taking int*.
+};
+
+static_assert(!std::is_move_constructible<no_move_deleter>::value, "");
+
+struct no_nullptr_deleter { // Deleter that is callable with int* but not with nullptr.
+  void operator()(int*) const {}
+  void operator()(std::nullptr_t) const = delete; // Exact match for a nullptr argument, so that call selects this deleted overload.
+};
+
+struct Base {}; // Empty base class.
+struct Derived : Base {}; // Publicly derives from Base, so Derived* converts implicitly to Base*.
+
+template <class T>
+class MoveDeleter { // Deleter whose only public constructors are the move constructor and MoveDeleter(int).
+  MoveDeleter(); // Private (default class access) and only declared here.
+  MoveDeleter(MoveDeleter const&); // Private (default class access) and only declared here.
+
+public:
+  MoveDeleter(MoveDeleter&&) {}
+
+  explicit MoveDeleter(int) {} // The int argument is ignored.
+
+  void operator()(T* ptr) { delete ptr; } // Releases ptr with a non-array delete.
+};
+
+#endif // TEST_STD_UTILITIES_MEMORY_UTIL_SMARTPTR_SHARED_CONST_TYPES_H
diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp
index d7374351afa8b..accb601dd0036 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.pass.cpp
@@ -209,6 +209,7 @@ template <class T, class Tuple>
 static constexpr bool can_make_from_tuple =
     std::is_same_v<decltype(test_make_from_tuple<T, Tuple>(T{}, Tuple{})), uint8_t>;
 
+#ifdef _LIBCPP_VERSION
 template <class T, class Tuple>
 auto test_make_from_tuple_impl(T&&, Tuple&& t)
     -> decltype(std::__make_from_tuple_impl<T>(
@@ -224,6 +225,7 @@ uint32_t test_make_from_tuple_impl(...) {
 template <class T, class Tuple>
 static constexpr bool can_make_from_tuple_impl =
     std::is_same_v<decltype(test_make_from_tuple_impl<T, Tuple>(T{}, Tuple{})), uint8_t>;
+#endif // _LIBCPP_VERSION
 
 struct A {
   int a;
@@ -263,23 +265,23 @@ static_assert(can_make_from_tuple<float, std::tuple<double>>);
 // Test std::__make_from_tuple_impl constraints.
 
 // reinterpret_cast
-static_assert(!can_make_from_tuple_impl<int*, std::tuple<A*>>);
-static_assert(can_make_from_tuple_impl<A*, std::tuple<A*>>);
+LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl<int*, std::tuple<A*>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<A*, std::tuple<A*>>);
 
 // const_cast
-static_assert(!can_make_from_tuple_impl<char*, std::tuple<const char*>>);
-static_assert(!can_make_from_tuple_impl<volatile char*, std::tuple<const volatile char*>>);
-static_assert(can_make_from_tuple_impl<volatile char*, std::tuple<volatile char*>>);
-static_assert(can_make_from_tuple_impl<char*, std::tuple<char*>>);
-static_assert(can_make_from_tuple_impl<const char*, std::tuple<char*>>);
-static_assert(can_make_from_tuple_impl<const volatile char*, std::tuple<volatile char*>>);
+LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl<char*, std::tuple<const char*>>);
+LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl<volatile char*, std::tuple<const volatile char*>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<volatile char*, std::tuple<volatile char*>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<char*, std::tuple<char*>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<const char*, std::tuple<char*>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<const volatile char*, std::tuple<volatile char*>>);
 
 // static_cast
-static_assert(!can_make_from_tuple_impl<int, std::tuple<D>>);
-static_assert(!can_make_from_tuple_impl<D, std::tuple<int>>);
-static_assert(can_make_from_tuple_impl<long, std::tuple<int>>);
-static_assert(can_make_from_tuple_impl<double, std::tuple<float>>);
-static_assert(can_make_from_tuple_impl<float, std::tuple<double>>);
+LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl<int, std::tuple<D>>);
+LIBCPP_STATIC_ASSERT(!can_make_from_tuple_impl<D, std::tuple<int>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<long, std::tuple<int>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<double, std::tuple<float>>);
+LIBCPP_STATIC_ASSERT(can_make_from_tuple_impl<float, std::tuple<double>>);
 
 } // namespace LWG3528
 
diff --git a/libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp b/libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp
new file mode 100644
index 0000000000000..dce9a5df220b2
--- /dev/null
+++ b/libcxx/test/std/utilities/utility/pairs/pairs.pair/lwg3382.compile.pass.cpp
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <utility>
+
+template <auto>
+struct Test {};
+
+void test() {
+  // LWG 3382. NTTP for pair and array
+  // https://cplusplus.github.io/LWG/issue3382
+#if !defined(_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR)
+  constexpr std::pair<int, long> a{};
+  [[maybe_unused]] Test<a> test1{}; // Uses the pair object itself as the argument for Test's `auto` non-type template parameter.
+#endif
+}
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp
index db05691c55818..039a2373348c4 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.swap/swap.pass.cpp
@@ -516,7 +516,7 @@ constexpr void test_swap_sfinae() {
   }
 }
 
-_LIBCPP_CONSTEXPR_SINCE_CXX20 void test_swap_noexcept() {
+TEST_CONSTEXPR_CXX20 void test_swap_noexcept() {
   {
     using V = std::variant<int, NothrowMoveable>;
     static_assert(std::is_swappable_v<V> && has_swap_member<V>(), "");
diff --git a/libcxx/test/support/msvc_stdlib_force_include.h b/libcxx/test/support/msvc_stdlib_force_include.h
index 6c26085e72c45..785670224c3b1 100644
--- a/libcxx/test/support/msvc_stdlib_force_include.h
+++ b/libcxx/test/support/msvc_stdlib_force_include.h
@@ -67,7 +67,6 @@ const AssertionDialogAvoider assertion_dialog_avoider{};
 // Silence compiler warnings.
 #  pragma warning(disable : 4180)  // qualifier applied to function type has no meaning; ignored
 #  pragma warning(disable : 4324)  // structure was padded due to alignment specifier
-#  pragma warning(disable : 4521)  // multiple copy constructors specified
 #  pragma warning(disable : 4702)  // unreachable code
 #  pragma warning(disable : 28251) // Inconsistent annotation for 'new': this instance has no annotations.
 #endif                             // !defined(__clang__)
@@ -91,7 +90,7 @@ const AssertionDialogAvoider assertion_dialog_avoider{};
 #include <version>
 
 #if _HAS_CXX23
-#  define TEST_STD_VER 99
+#  define TEST_STD_VER 23
 #elif _HAS_CXX20
 #  define TEST_STD_VER 20
 #elif _HAS_CXX17
diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
index 28c1dbf8aca3c..f0289dc44c662 100644
--- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
+++ b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
@@ -110,3 +110,5 @@ set_target_properties(cxx-tidy PROPERTIES
 
 set_target_properties(cxx-tidy PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 set(CMAKE_SHARED_MODULE_SUFFIX_CXX .plugin) # Use a portable suffix to simplify how we can find it from Lit
+
+add_dependencies(cxx-test-depends cxx-tidy)
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index b04cb4f511554..490ecefc97522 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -61,7 +61,8 @@ def add_version_header(tc):
 #                   just libc++. It may depend on
 #                    * macros defined by the compiler itself, or
 #                    * macros generated by CMake.
-#                   In some cases we add also depend on macros defined in <__availability>.
+#                   In some cases we also depend on macros defined in
+#                   <__configuration/availability.h>.
 # libcxx_guard      An optional string field. When this field is provided,
 #                   `test_suite_guard` must also be provided. This field is used
 #                   only to guard the feature-test macro in <version>. It may
@@ -515,7 +516,7 @@ def add_version_header(tc):
         },
         {
             "name": "__cpp_lib_format_path",
-            "values": {"c++23": 202403}, # P2845R8: Formatting of std::filesystem::path
+            "values": {"c++26": 202403},  # P2845R8: Formatting of std::filesystem::path
             "headers": ["filesystem"],
             "unimplemented": True,
         },
@@ -1270,7 +1271,7 @@ def add_version_header(tc):
         },
         {
             "name": "__cpp_lib_to_string",
-            "values": {"c++23": 202306},  # P2587R3 to_string or not to_string
+            "values": {"c++26": 202306},  # P2587R3 to_string or not to_string
             "headers": ["string"],
             "unimplemented": True,
         },
@@ -1562,7 +1563,6 @@ def produce_version_header():
 
 */
 
-#include <__availability>
 #include <__config>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index c2d294e49f488..4c8590a2135d9 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -143,6 +143,14 @@ def getSuitableClangTidy(cfg):
 
 # fmt: off
 DEFAULT_PARAMETERS = [
+    Parameter(
+        name="compiler",
+        type=str,
+        help="The path of the compiler to use for testing.",
+        actions=lambda cxx: [
+            AddSubstitution("%{cxx}", shlex.quote(cxx)),
+        ],
+    ),
     Parameter(
         name="target_triple",
         type=str,
diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index f7673da25d20e..86fe4a604f30d 100644
--- a/libcxxabi/CMakeLists.txt
+++ b/libcxxabi/CMakeLists.txt
@@ -443,6 +443,9 @@ if (NOT "${LIBCXXABI_LIBUNWIND_INCLUDES_INTERNAL}" STREQUAL "")
   include_directories("${LIBCXXABI_LIBUNWIND_INCLUDES_INTERNAL}")
 endif()
 
+add_custom_target(cxxabi-test-depends
+  COMMENT "Build dependencies required to run the libc++abi test suite.")
+
 # Add source code. This also contains all of the logic for deciding linker flags
 # soname, etc...
 add_subdirectory(include)
diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt
index c8cc93de50777..c54ced4dc3ea8 100644
--- a/libcxxabi/src/CMakeLists.txt
+++ b/libcxxabi/src/CMakeLists.txt
@@ -304,6 +304,7 @@ endif()
 
 # Add a meta-target for both libraries.
 add_custom_target(cxxabi DEPENDS ${LIBCXXABI_BUILD_TARGETS})
+add_dependencies(cxxabi-test-depends cxxabi cxx)
 
 if (LIBCXXABI_INSTALL_LIBRARY)
   install(TARGETS ${LIBCXXABI_INSTALL_TARGETS}
diff --git a/libcxxabi/test/CMakeLists.txt b/libcxxabi/test/CMakeLists.txt
index 586927189cf1d..8e3048f2ffe8a 100644
--- a/libcxxabi/test/CMakeLists.txt
+++ b/libcxxabi/test/CMakeLists.txt
@@ -10,20 +10,11 @@ endmacro()
 
 pythonize_bool(LIBCXXABI_USE_LLVM_UNWINDER)
 
-if (LIBCXXABI_ENABLE_SHARED)
-  set(LIBCXXABI_TEST_DEPS cxxabi_shared)
-else()
-  set(LIBCXXABI_TEST_DEPS cxxabi_static)
-endif()
-
-list(APPEND LIBCXXABI_TEST_DEPS cxx)
-if (LIBCXXABI_USE_LLVM_UNWINDER AND TARGET unwind)
-  list(APPEND LIBCXXABI_TEST_DEPS unwind)
-endif()
-
 set(AUTO_GEN_COMMENT "## Autogenerated by libcxxabi configuration.\n# Do not edit!")
 set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
 
+serialize_lit_string_param(SERIALIZED_LIT_PARAMS compiler "${CMAKE_CXX_COMPILER}")
+
 if (NOT LIBCXXABI_ENABLE_EXCEPTIONS)
   serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False)
 endif()
@@ -57,4 +48,4 @@ configure_lit_site_cfg(
 
 add_lit_testsuite(check-cxxabi "Running libcxxabi tests"
   ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS ${LIBCXXABI_TEST_DEPS})
+  DEPENDS cxxabi-test-depends)
diff --git a/libcxxabi/test/configs/cmake-bridge.cfg.in b/libcxxabi/test/configs/cmake-bridge.cfg.in
index 1d0f51d37437b..3fefc6a7fdc88 100644
--- a/libcxxabi/test/configs/cmake-bridge.cfg.in
+++ b/libcxxabi/test/configs/cmake-bridge.cfg.in
@@ -26,7 +26,6 @@ config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test')
 # TODO: This is a non-standard Lit attribute and we should have another way of accessing this.
 config.host_triple = '@LLVM_HOST_TRIPLE@'
 
-config.substitutions.append(('%{cxx}', '@CMAKE_CXX_COMPILER@'))
 config.substitutions.append(('%{libcxx}', '@LIBCXXABI_LIBCXX_PATH@'))
 config.substitutions.append(('%{include}', '@LIBCXXABI_SOURCE_DIR@/include'))
 config.substitutions.append(('%{cxx-include}', '@LIBCXXABI_HEADER_DIR@/include/c++/v1'))
diff --git a/libunwind/test/CMakeLists.txt b/libunwind/test/CMakeLists.txt
index 21dfbb0a84f0a..19f055f6f93ff 100644
--- a/libunwind/test/CMakeLists.txt
+++ b/libunwind/test/CMakeLists.txt
@@ -15,6 +15,8 @@ pythonize_bool(LIBUNWIND_USES_ARM_EHABI)
 set(AUTO_GEN_COMMENT "## Autogenerated by libunwind configuration.\n# Do not edit!")
 set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n")
 
+serialize_lit_string_param(SERIALIZED_LIT_PARAMS compiler "${CMAKE_CXX_COMPILER}")
+
 if (LIBUNWIND_EXECUTOR)
   message(DEPRECATION "LIBUNWIND_EXECUTOR is deprecated, please add executor=... to LIBUNWIND_TEST_PARAMS")
   serialize_lit_string_param(SERIALIZED_LIT_PARAMS executor "${LIBUNWIND_EXECUTOR}")
@@ -45,4 +47,4 @@ configure_lit_site_cfg(
 
 add_lit_testsuite(check-unwind "Running libunwind tests"
   ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS unwind ${LIBUNWIND_TEST_DEPS})
+  DEPENDS unwind)
diff --git a/libunwind/test/configs/cmake-bridge.cfg.in b/libunwind/test/configs/cmake-bridge.cfg.in
index c5f34c87abb92..7fc7a3da42462 100644
--- a/libunwind/test/configs/cmake-bridge.cfg.in
+++ b/libunwind/test/configs/cmake-bridge.cfg.in
@@ -29,7 +29,5 @@ if not @LIBUNWIND_ENABLE_THREADS@:
     config.available_features.add('libunwind-no-threads')
 
 # Add substitutions for bootstrapping the test suite configuration
-import shlex
-config.substitutions.append(('%{cxx}', shlex.quote('@CMAKE_CXX_COMPILER@')))
 config.substitutions.append(('%{include}', '@LIBUNWIND_SOURCE_DIR@/include'))
 config.substitutions.append(('%{lib}', '@LIBUNWIND_LIBRARY_DIR@'))
diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp
index 9d1612beae872..6e857cfcd92f6 100644
--- a/lld/MachO/ObjC.cpp
+++ b/lld/MachO/ObjC.cpp
@@ -379,12 +379,21 @@ class ObjcCategoryMerger {
     InfoWriteSection catPtrListInfo;
   };
 
-  // Information about a pointer list in the original categories (method lists,
-  // protocol lists, etc)
+  // Information about a pointer list in the original categories or class
+  // (method lists, protocol lists, etc)
   struct PointerListInfo {
+    PointerListInfo() = default;
+    PointerListInfo(const PointerListInfo &) = default;
     PointerListInfo(const char *_categoryPrefix, uint32_t _pointersPerStruct)
         : categoryPrefix(_categoryPrefix),
           pointersPerStruct(_pointersPerStruct) {}
+
+    inline bool operator==(const PointerListInfo &cmp) const {
+      return pointersPerStruct == cmp.pointersPerStruct &&
+             structSize == cmp.structSize && structCount == cmp.structCount &&
+             allPtrs == cmp.allPtrs;
+    }
+
     const char *categoryPrefix;
 
     uint32_t pointersPerStruct = 0;
@@ -395,9 +404,9 @@ class ObjcCategoryMerger {
     std::vector<Symbol *> allPtrs;
   };
 
-  // Full information about all the categories that extend a class. This will
-  // include all the additional methods, protocols, and properties that are
-  // contained in all the categories that extend a particular class.
+  // Full information describing an ObjC class. This will include all the
+  // additional methods, protocols, and properties that are contained in the
+  // class and all the categories that extend a particular class.
   struct ClassExtensionInfo {
     ClassExtensionInfo(CategoryLayout &_catLayout) : catLayout(_catLayout){};
 
@@ -449,6 +458,9 @@ class ObjcCategoryMerger {
   void parseProtocolListInfo(const ConcatInputSection *isec, uint32_t secOffset,
                              PointerListInfo &ptrList);
 
+  PointerListInfo parseProtocolListInfo(const ConcatInputSection *isec,
+                                        uint32_t secOffset);
+
   void parsePointerListInfo(const ConcatInputSection *isec, uint32_t secOffset,
                             PointerListInfo &ptrList);
 
@@ -456,9 +468,9 @@ class ObjcCategoryMerger {
                               const ClassExtensionInfo &extInfo,
                               const PointerListInfo &ptrList);
 
-  void emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset,
-                               const ClassExtensionInfo &extInfo,
-                               const PointerListInfo &ptrList);
+  Defined *emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset,
+                                   const ClassExtensionInfo &extInfo,
+                                   const PointerListInfo &ptrList);
 
   Defined *emitCategory(const ClassExtensionInfo &extInfo);
   Defined *emitCatListEntrySec(const std::string &forCategoryName,
@@ -474,6 +486,10 @@ class ObjcCategoryMerger {
                                    uint32_t offset);
   Defined *tryGetDefinedAtIsecOffset(const ConcatInputSection *isec,
                                      uint32_t offset);
+  Defined *getClassRo(const Defined *classSym, bool getMetaRo);
+  void mergeCategoriesIntoBaseClass(const Defined *baseClass,
+                                    std::vector<InfoInputCategory> &categories);
+  void eraseSymbolAtIsecOffset(ConcatInputSection *isec, uint32_t offset);
   void tryEraseDefinedAtIsecOffset(const ConcatInputSection *isec,
                                    uint32_t offset);
 
@@ -552,6 +568,36 @@ ObjcCategoryMerger::tryGetDefinedAtIsecOffset(const ConcatInputSection *isec,
   return dyn_cast_or_null<Defined>(sym);
 }
 
+// Get the class's ro_data symbol. If getMetaRo is true, then we will return
+// the meta-class's ro_data symbol. Otherwise, we will return the class
+// (instance) ro_data symbol. Returns nullptr if the class (or its meta-class)
+// does not live in a ConcatInputSection, or if no Defined symbol is referenced
+// at the expected offset.
+Defined *ObjcCategoryMerger::getClassRo(const Defined *classSym,
+                                        bool getMetaRo) {
+  ConcatInputSection *isec = dyn_cast<ConcatInputSection>(classSym->isec());
+  if (!isec)
+    return nullptr;
+
+  if (!getMetaRo)
+    return tryGetDefinedAtIsecOffset(isec, classLayout.roDataOffset +
+                                               classSym->value);
+
+  Defined *metaClass = tryGetDefinedAtIsecOffset(
+      isec, classLayout.metaClassOffset + classSym->value);
+  if (!metaClass)
+    return nullptr;
+
+  // The cast yields nullptr when the meta-class is not in a
+  // ConcatInputSection; bail out instead of passing a null section along.
+  auto *metaIsec = dyn_cast_or_null<ConcatInputSection>(metaClass->isec());
+  if (!metaIsec)
+    return nullptr;
+
+  // As for the class itself above, the meta-class struct starts at the
+  // symbol's own offset within its section.
+  return tryGetDefinedAtIsecOffset(
+      metaIsec, classLayout.roDataOffset + metaClass->value);
+}
+
 // Given an ConcatInputSection or CStringInputSection and an offset, if there is
 // a symbol(Defined) at that offset, then erase the symbol (mark it not live)
 void ObjcCategoryMerger::tryEraseDefinedAtIsecOffset(
@@ -663,6 +702,15 @@ void ObjcCategoryMerger::parseProtocolListInfo(const ConcatInputSection *isec,
          "Protocol list end offset does not match expected size");
 }
 
+// Parse a protocol list and return the PointerListInfo for it
+ObjcCategoryMerger::PointerListInfo
+ObjcCategoryMerger::parseProtocolListInfo(const ConcatInputSection *isec,
+                                          uint32_t secOffset) {
+  PointerListInfo ptrList;
+  parseProtocolListInfo(isec, secOffset, ptrList);
+  return ptrList;
+}
+
 // Parse a pointer list that might be linked to ConcatInputSection at a given
 // offset. This can be used for instance methods, class methods, instance props
 // and class props since they have the same format.
@@ -769,11 +817,11 @@ void ObjcCategoryMerger::parseCatInfoToExtInfo(const InfoInputCategory &catInfo,
 
 // Generate a protocol list (including header) and link it into the parent at
 // the specified offset.
-void ObjcCategoryMerger::emitAndLinkProtocolList(
+Defined *ObjcCategoryMerger::emitAndLinkProtocolList(
     Defined *parentSym, uint32_t linkAtOffset,
     const ClassExtensionInfo &extInfo, const PointerListInfo &ptrList) {
   if (ptrList.allPtrs.empty())
-    return;
+    return nullptr;
 
   assert(ptrList.allPtrs.size() == ptrList.structCount);
 
@@ -820,6 +868,8 @@ void ObjcCategoryMerger::emitAndLinkProtocolList(
                           infoCategoryWriter.catPtrListInfo.relocTemplate);
     offset += target->wordSize;
   }
+
+  return ptrListSym;
 }
 
 // Generate a pointer list (including header) and link it into the parent at the
@@ -1265,10 +1315,15 @@ void ObjcCategoryMerger::removeRefsToErasedIsecs() {
 void ObjcCategoryMerger::doMerge() {
   collectAndValidateCategoriesData();
 
-  for (auto &entry : categoryMap)
-    if (entry.second.size() > 1)
+  for (auto &[baseClass, catInfos] : categoryMap) {
+    if (auto *baseClassDef = dyn_cast<Defined>(baseClass)) {
+      // Merge all categories into the base class
+      mergeCategoriesIntoBaseClass(baseClassDef, catInfos);
+    } else if (catInfos.size() > 1) {
       // Merge all categories into a new, single category
-      mergeCategoriesIntoSingleCategory(entry.second);
+      mergeCategoriesIntoSingleCategory(catInfos);
+    }
+  }
 
   // Erase all categories that were merged
   eraseMergedCategories();
@@ -1302,3 +1357,111 @@ void objc::mergeCategories() {
 }
 
 void objc::doCleanup() { ObjcCategoryMerger::doCleanup(); }
+
+// Merge the given categories directly into their base class. The info from
+// the categories is collected first, then the info from the class and
+// meta-class ro_data; the old lists are erased and merged lists are emitted in
+// their place. On success every category is flagged as merged.
+void ObjcCategoryMerger::mergeCategoriesIntoBaseClass(
+    const Defined *baseClass, std::vector<InfoInputCategory> &categories) {
+  assert(categories.size() >= 1 && "Expected at least one category to merge");
+
+  // Get metadata for the base class up front. getClassRo() returns nullptr
+  // when the expected symbol is missing, so validate before dereferencing; if
+  // anything is missing, leave the categories unmerged.
+  Defined *metaRo = getClassRo(baseClass, /*getMetaRo=*/true);
+  Defined *classRo = getClassRo(baseClass, /*getMetaRo=*/false);
+  if (!metaRo || !classRo)
+    return;
+
+  auto *metaIsec = dyn_cast_or_null<ConcatInputSection>(metaRo->isec());
+  auto *classIsec = dyn_cast_or_null<ConcatInputSection>(classRo->isec());
+  if (!metaIsec || !classIsec)
+    return;
+
+  // Collect all the info from the categories
+  ClassExtensionInfo extInfo(catLayout);
+  for (auto &catInfo : categories)
+    parseCatInfoToExtInfo(catInfo, extInfo);
+
+  // Now collect the info from the base class from the various lists in the
+  // class metadata
+
+  // Protocol lists are a special case - the same protocol list is in classRo
+  // and metaRo, so we only need to parse it once
+  parseProtocolListInfo(classIsec, roClassLayout.baseProtocolsOffset,
+                        extInfo.protocols);
+
+  // Check that the classRo and metaRo protocol lists are identical
+  assert(
+      parseProtocolListInfo(classIsec, roClassLayout.baseProtocolsOffset) ==
+          parseProtocolListInfo(metaIsec, roClassLayout.baseProtocolsOffset) &&
+      "Category merger expects classRo and metaRo to have the same protocol "
+      "list");
+
+  parsePointerListInfo(metaIsec, roClassLayout.baseMethodsOffset,
+                       extInfo.classMethods);
+  parsePointerListInfo(classIsec, roClassLayout.baseMethodsOffset,
+                       extInfo.instanceMethods);
+
+  parsePointerListInfo(metaIsec, roClassLayout.basePropertiesOffset,
+                       extInfo.classProps);
+  parsePointerListInfo(classIsec, roClassLayout.basePropertiesOffset,
+                       extInfo.instanceProps);
+
+  // Erase the old lists - these will be generated and replaced
+  eraseSymbolAtIsecOffset(metaIsec, roClassLayout.baseMethodsOffset);
+  eraseSymbolAtIsecOffset(metaIsec, roClassLayout.baseProtocolsOffset);
+  eraseSymbolAtIsecOffset(metaIsec, roClassLayout.basePropertiesOffset);
+  eraseSymbolAtIsecOffset(classIsec, roClassLayout.baseMethodsOffset);
+  eraseSymbolAtIsecOffset(classIsec, roClassLayout.baseProtocolsOffset);
+  eraseSymbolAtIsecOffset(classIsec, roClassLayout.basePropertiesOffset);
+
+  // Emit the newly merged lists - first into the meta RO then into the class RO
+  // First we emit and link the protocol list into the meta RO. Then we link it
+  // in the classRo as well (they're supposed to be identical)
+  if (Defined *protoListSym =
+          emitAndLinkProtocolList(metaRo, roClassLayout.baseProtocolsOffset,
+                                  extInfo, extInfo.protocols)) {
+    createSymbolReference(classRo, protoListSym,
+                          roClassLayout.baseProtocolsOffset,
+                          infoCategoryWriter.catBodyInfo.relocTemplate);
+  }
+
+  emitAndLinkPointerList(metaRo, roClassLayout.baseMethodsOffset, extInfo,
+                         extInfo.classMethods);
+  emitAndLinkPointerList(classRo, roClassLayout.baseMethodsOffset, extInfo,
+                         extInfo.instanceMethods);
+
+  emitAndLinkPointerList(metaRo, roClassLayout.basePropertiesOffset, extInfo,
+                         extInfo.classProps);
+
+  emitAndLinkPointerList(classRo, roClassLayout.basePropertiesOffset, extInfo,
+                         extInfo.instanceProps);
+
+  // Mark all the categories as merged - this will be used to erase them later
+  for (auto &catInfo : categories)
+    catInfo.wasMerged = true;
+}
+
+// Erase the symbol at a given offset in an InputSection
+void ObjcCategoryMerger::eraseSymbolAtIsecOffset(ConcatInputSection *isec,
+                                                 uint32_t offset) {
+  Defined *sym = tryGetDefinedAtIsecOffset(isec, offset);
+  if (!sym)
+    return;
+
+  // Remove the symbol from isec->symbols
+  assert(isa<Defined>(sym) && "Can only erase a Defined");
+  llvm::erase(isec->symbols, sym);
+
+  // Remove the relocs that refer to this symbol
+  auto removeAtOff = [offset](Reloc const &r) { return r.offset == offset; };
+  llvm::erase_if(isec->relocs, removeAtOff);
+
+  // Now, if the symbol fully occupies a ConcatInputSection, we can also erase
+  // the whole ConcatInputSection
+  if (ConcatInputSection *cisec = dyn_cast<ConcatInputSection>(sym->isec()))
+    if (cisec->data.size() == sym->size)
+      eraseISec(cisec);
+}
diff --git a/lld/test/MachO/objc-category-merging-complete-test.s b/lld/test/MachO/objc-category-merging-complete-test.s
index 74400177b550d..cf3e19e2f9c8b 100644
--- a/lld/test/MachO/objc-category-merging-complete-test.s
+++ b/lld/test/MachO/objc-category-merging-complete-test.s
@@ -1,6 +1,7 @@
 # REQUIRES: aarch64
 # RUN: rm -rf %t; split-file %s %t && cd %t
 
+############ Test merging multiple categories into a single category ############
 ## Create a dylib to link against(a64_file1.dylib) and merge categories in the main binary (file2_merge_a64.exe)
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_file1.o a64_file1.s
 # RUN: %lld -arch arm64 a64_file1.o -o a64_file1.dylib -dylib
@@ -12,6 +13,10 @@
 # RUN: llvm-objdump --objc-meta-data --macho a64_file2_no_merge.exe | FileCheck %s --check-prefixes=NO_MERGE_CATS
 # RUN: llvm-objdump --objc-meta-data --macho a64_file2_merge.exe | FileCheck %s --check-prefixes=MERGE_CATS
 
+############ Test merging multiple categories into the base class ############
+# RUN: %lld -arch arm64 -o a64_file2_merge_into_class.exe -objc_category_merging a64_file1.o a64_file2.o
+# RUN: llvm-objdump --objc-meta-data --macho a64_file2_merge_into_class.exe | FileCheck %s --check-prefixes=MERGE_CATS_CLS
+
 
 MERGE_CATS:     __OBJC_$_CATEGORY_MyBaseClass(Category02|Category03)
 MERGE_CATS-NEXT:              name {{.*}} Category02|Category03
@@ -101,6 +106,211 @@ NO_MERGE_CATS-NEXT: 24
 NO_MERGE_CATS-NEXT: 2
 
 
+MERGE_CATS_CLS:        _OBJC_CLASS_$_MyBaseClass
+MERGE_CATS_CLS-NEXT:            isa {{.*}} _OBJC_METACLASS_$_MyBaseClass
+MERGE_CATS_CLS-NEXT:     superclass 0x0
+MERGE_CATS_CLS-NEXT:          cache {{.*}} __objc_empty_cache
+MERGE_CATS_CLS-NEXT:         vtable 0x0
+MERGE_CATS_CLS-NEXT:           data {{.*}} (struct class_ro_t *)
+MERGE_CATS_CLS-NEXT:                     flags 0x2 RO_ROOT
+MERGE_CATS_CLS-NEXT:             instanceStart 0
+MERGE_CATS_CLS-NEXT:              instanceSize 4
+MERGE_CATS_CLS-NEXT:                  reserved 0x0
+MERGE_CATS_CLS-NEXT:                ivarLayout 0x0
+MERGE_CATS_CLS-NEXT:                      name {{.*}} MyBaseClass
+MERGE_CATS_CLS-NEXT:               baseMethods {{.*}} (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:            entsize 24
+MERGE_CATS_CLS-NEXT:              count 8
+MERGE_CATS_CLS-NEXT:               name {{.*}} class02InstanceMethod
+MERGE_CATS_CLS-NEXT:              types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                imp -[MyBaseClass(Category02) class02InstanceMethod]
+MERGE_CATS_CLS-NEXT:               name {{.*}} myProtocol02Method
+MERGE_CATS_CLS-NEXT:              types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                imp -[MyBaseClass(Category02) myProtocol02Method]
+MERGE_CATS_CLS-NEXT:               name {{.*}} class03InstanceMethod
+MERGE_CATS_CLS-NEXT:              types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                imp -[MyBaseClass(Category03) class03InstanceMethod]
+MERGE_CATS_CLS-NEXT:               name {{.*}} myProtocol03Method
+MERGE_CATS_CLS-NEXT:              types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                imp -[MyBaseClass(Category03) myProtocol03Method]
+MERGE_CATS_CLS-NEXT:               name {{.*}} baseInstanceMethod
+MERGE_CATS_CLS-NEXT:              types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                imp -[MyBaseClass baseInstanceMethod]
+MERGE_CATS_CLS-NEXT:               name {{.*}} myProtocol01Method
+MERGE_CATS_CLS-NEXT:              types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                imp -[MyBaseClass myProtocol01Method]
+MERGE_CATS_CLS-NEXT:               name {{.*}} MyProtocol01Prop
+MERGE_CATS_CLS-NEXT:              types {{.*}} i16@0:8
+MERGE_CATS_CLS-NEXT:                imp -[MyBaseClass MyProtocol01Prop]
+MERGE_CATS_CLS-NEXT:               name {{.*}} setMyProtocol01Prop:
+MERGE_CATS_CLS-NEXT:              types {{.*}} v20@0:8i16
+MERGE_CATS_CLS-NEXT:                imp -[MyBaseClass setMyProtocol01Prop:]
+MERGE_CATS_CLS-NEXT:             baseProtocols {{.*}}
+MERGE_CATS_CLS-NEXT:                       count 3
+MERGE_CATS_CLS-NEXT:               list[0] {{.*}} (struct protocol_t *)
+MERGE_CATS_CLS-NEXT:                   isa 0x0
+MERGE_CATS_CLS-NEXT:                  name {{.*}} MyProtocol02
+MERGE_CATS_CLS-NEXT:             protocols 0x0
+MERGE_CATS_CLS-NEXT:           instanceMethods {{.*}} (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:                entsize 24
+MERGE_CATS_CLS-NEXT:                  count 2
+MERGE_CATS_CLS-NEXT:                   name {{.*}} myProtocol02Method
+MERGE_CATS_CLS-NEXT:                  types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:                   name {{.*}} MyProtocol02Prop
+MERGE_CATS_CLS-NEXT:                  types {{.*}} i16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:              classMethods 0x0 (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:       optionalInstanceMethods 0x0
+MERGE_CATS_CLS-NEXT:          optionalClassMethods 0x0
+MERGE_CATS_CLS-NEXT:            instanceProperties {{.*}}
+MERGE_CATS_CLS-NEXT:               list[1] {{.*}} (struct protocol_t *)
+MERGE_CATS_CLS-NEXT:                   isa 0x0
+MERGE_CATS_CLS-NEXT:                  name {{.*}} MyProtocol03
+MERGE_CATS_CLS-NEXT:             protocols 0x0
+MERGE_CATS_CLS-NEXT:           instanceMethods {{.*}} (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:                entsize 24
+MERGE_CATS_CLS-NEXT:                  count 2
+MERGE_CATS_CLS-NEXT:                   name {{.*}} myProtocol03Method
+MERGE_CATS_CLS-NEXT:                  types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:                   name {{.*}} MyProtocol03Prop
+MERGE_CATS_CLS-NEXT:                  types {{.*}} i16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:              classMethods 0x0 (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:       optionalInstanceMethods 0x0
+MERGE_CATS_CLS-NEXT:          optionalClassMethods 0x0
+MERGE_CATS_CLS-NEXT:            instanceProperties {{.*}}
+MERGE_CATS_CLS-NEXT:               list[2] {{.*}} (struct protocol_t *)
+MERGE_CATS_CLS-NEXT:                   isa 0x0
+MERGE_CATS_CLS-NEXT:                  name {{.*}} MyProtocol01
+MERGE_CATS_CLS-NEXT:             protocols 0x0
+MERGE_CATS_CLS-NEXT:           instanceMethods {{.*}} (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:                entsize 24
+MERGE_CATS_CLS-NEXT:                  count 3
+MERGE_CATS_CLS-NEXT:                   name {{.*}} myProtocol01Method
+MERGE_CATS_CLS-NEXT:                  types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:                   name {{.*}} MyProtocol01Prop
+MERGE_CATS_CLS-NEXT:                  types {{.*}} i16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:                   name {{.*}} setMyProtocol01Prop:
+MERGE_CATS_CLS-NEXT:                  types {{.*}} v20@0:8i16
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:              classMethods 0x0 (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:       optionalInstanceMethods 0x0
+MERGE_CATS_CLS-NEXT:          optionalClassMethods 0x0
+MERGE_CATS_CLS-NEXT:            instanceProperties {{.*}}
+MERGE_CATS_CLS-NEXT:                     ivars {{.*}}
+MERGE_CATS_CLS-NEXT:                     entsize 32
+MERGE_CATS_CLS-NEXT:                       count 1
+MERGE_CATS_CLS-NEXT:                offset {{.*}} 0
+MERGE_CATS_CLS-NEXT:                  name {{.*}} MyProtocol01Prop
+MERGE_CATS_CLS-NEXT:                  type {{.*}} i
+MERGE_CATS_CLS-NEXT:             alignment 2
+MERGE_CATS_CLS-NEXT:                  size 4
+MERGE_CATS_CLS-NEXT:            weakIvarLayout 0x0
+MERGE_CATS_CLS-NEXT:            baseProperties {{.*}}
+MERGE_CATS_CLS-NEXT:                     entsize 16
+MERGE_CATS_CLS-NEXT:                       count 3
+MERGE_CATS_CLS-NEXT:                  name {{.*}} MyProtocol02Prop
+MERGE_CATS_CLS-NEXT:             attributes {{.*}} Ti,R,D
+MERGE_CATS_CLS-NEXT:                  name {{.*}} MyProtocol03Prop
+MERGE_CATS_CLS-NEXT:             attributes {{.*}} Ti,R,D
+MERGE_CATS_CLS-NEXT:                  name {{.*}} MyProtocol01Prop
+MERGE_CATS_CLS-NEXT:             attributes {{.*}} Ti,N,VMyProtocol01Prop
+MERGE_CATS_CLS-NEXT: Meta Class
+MERGE_CATS_CLS-NEXT:            isa {{.*}} _OBJC_METACLASS_$_MyBaseClass
+MERGE_CATS_CLS-NEXT:     superclass {{.*}} _OBJC_CLASS_$_MyBaseClass
+MERGE_CATS_CLS-NEXT:          cache {{.*}} __objc_empty_cache
+MERGE_CATS_CLS-NEXT:         vtable 0x0
+MERGE_CATS_CLS-NEXT:           data {{.*}} (struct class_ro_t *)
+MERGE_CATS_CLS-NEXT:                     flags 0x3 RO_META RO_ROOT
+MERGE_CATS_CLS-NEXT:             instanceStart 40
+MERGE_CATS_CLS-NEXT:              instanceSize 40
+MERGE_CATS_CLS-NEXT:                  reserved 0x0
+MERGE_CATS_CLS-NEXT:                ivarLayout 0x0
+MERGE_CATS_CLS-NEXT:                      name {{.*}} MyBaseClass
+MERGE_CATS_CLS-NEXT:               baseMethods {{.*}} (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:            entsize 24
+MERGE_CATS_CLS-NEXT:              count 5
+MERGE_CATS_CLS-NEXT:               name {{.*}} class02ClassMethod
+MERGE_CATS_CLS-NEXT:              types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                imp +[MyBaseClass(Category02) class02ClassMethod]
+MERGE_CATS_CLS-NEXT:               name {{.*}} MyProtocol02Prop
+MERGE_CATS_CLS-NEXT:              types {{.*}} i16@0:8
+MERGE_CATS_CLS-NEXT:                imp +[MyBaseClass(Category02) MyProtocol02Prop]
+MERGE_CATS_CLS-NEXT:               name {{.*}} class03ClassMethod
+MERGE_CATS_CLS-NEXT:              types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                imp +[MyBaseClass(Category03) class03ClassMethod]
+MERGE_CATS_CLS-NEXT:               name {{.*}} MyProtocol03Prop
+MERGE_CATS_CLS-NEXT:              types {{.*}} i16@0:8
+MERGE_CATS_CLS-NEXT:                imp +[MyBaseClass(Category03) MyProtocol03Prop]
+MERGE_CATS_CLS-NEXT:               name {{.*}} baseClassMethod
+MERGE_CATS_CLS-NEXT:              types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                imp +[MyBaseClass baseClassMethod]
+MERGE_CATS_CLS-NEXT:             baseProtocols {{.*}}
+MERGE_CATS_CLS-NEXT:                       count 3
+MERGE_CATS_CLS-NEXT:               list[0] {{.*}} (struct protocol_t *)
+MERGE_CATS_CLS-NEXT:                   isa 0x0
+MERGE_CATS_CLS-NEXT:                  name {{.*}} MyProtocol02
+MERGE_CATS_CLS-NEXT:             protocols 0x0
+MERGE_CATS_CLS-NEXT:           instanceMethods {{.*}} (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:                entsize 24
+MERGE_CATS_CLS-NEXT:                  count 2
+MERGE_CATS_CLS-NEXT:                   name {{.*}} myProtocol02Method
+MERGE_CATS_CLS-NEXT:                  types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:                   name {{.*}} MyProtocol02Prop
+MERGE_CATS_CLS-NEXT:                  types {{.*}} i16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:              classMethods 0x0 (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:       optionalInstanceMethods 0x0
+MERGE_CATS_CLS-NEXT:          optionalClassMethods 0x0
+MERGE_CATS_CLS-NEXT:            instanceProperties {{.*}}
+MERGE_CATS_CLS-NEXT:               list[1] {{.*}} (struct protocol_t *)
+MERGE_CATS_CLS-NEXT:                   isa 0x0
+MERGE_CATS_CLS-NEXT:                  name {{.*}} MyProtocol03
+MERGE_CATS_CLS-NEXT:             protocols 0x0
+MERGE_CATS_CLS-NEXT:           instanceMethods {{.*}} (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:                entsize 24
+MERGE_CATS_CLS-NEXT:                  count 2
+MERGE_CATS_CLS-NEXT:                   name {{.*}} myProtocol03Method
+MERGE_CATS_CLS-NEXT:                  types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:                   name {{.*}} MyProtocol03Prop
+MERGE_CATS_CLS-NEXT:                  types {{.*}} i16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:              classMethods 0x0 (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:       optionalInstanceMethods 0x0
+MERGE_CATS_CLS-NEXT:          optionalClassMethods 0x0
+MERGE_CATS_CLS-NEXT:            instanceProperties {{.*}}
+MERGE_CATS_CLS-NEXT:               list[2] {{.*}} (struct protocol_t *)
+MERGE_CATS_CLS-NEXT:                   isa 0x0
+MERGE_CATS_CLS-NEXT:                  name {{.*}} MyProtocol01
+MERGE_CATS_CLS-NEXT:             protocols 0x0
+MERGE_CATS_CLS-NEXT:           instanceMethods {{.*}} (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:                entsize 24
+MERGE_CATS_CLS-NEXT:                  count 3
+MERGE_CATS_CLS-NEXT:                   name {{.*}} myProtocol01Method
+MERGE_CATS_CLS-NEXT:                  types {{.*}} v16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:                   name {{.*}} MyProtocol01Prop
+MERGE_CATS_CLS-NEXT:                  types {{.*}} i16@0:8
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:                   name {{.*}} setMyProtocol01Prop:
+MERGE_CATS_CLS-NEXT:                  types {{.*}} v20@0:8i16
+MERGE_CATS_CLS-NEXT:                    imp 0x0
+MERGE_CATS_CLS-NEXT:              classMethods 0x0 (struct method_list_t *)
+MERGE_CATS_CLS-NEXT:       optionalInstanceMethods 0x0
+MERGE_CATS_CLS-NEXT:          optionalClassMethods 0x0
+MERGE_CATS_CLS-NEXT:            instanceProperties {{.*}}
+MERGE_CATS_CLS-NEXT:                     ivars 0x0
+MERGE_CATS_CLS-NEXT:            weakIvarLayout 0x0
+MERGE_CATS_CLS-NEXT:            baseProperties 0x0
+MERGE_CATS_CLS:        __OBJC_$_CATEGORY_MyBaseClass_$_Category04
+
+
 #--- a64_file1.s
 
 ## @protocol MyProtocol01
diff --git a/lld/test/MachO/objc-category-merging-extern-class-minimal.s b/lld/test/MachO/objc-category-merging-minimal.s
similarity index 59%
rename from lld/test/MachO/objc-category-merging-extern-class-minimal.s
rename to lld/test/MachO/objc-category-merging-minimal.s
index 5dd8924df5ad6..fcd90f178b150 100644
--- a/lld/test/MachO/objc-category-merging-extern-class-minimal.s
+++ b/lld/test/MachO/objc-category-merging-minimal.s
@@ -1,7 +1,8 @@
 # REQUIRES: aarch64
 # RUN: rm -rf %t; split-file %s %t && cd %t
 
-## Create a dylib with a fake base class to link against
+############ Test merging multiple categories into a single category ############
+## Create a dylib with a fake base class to link against when merging between categories
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_fakedylib.o a64_fakedylib.s
 # RUN: %lld -arch arm64 a64_fakedylib.o -o a64_fakedylib.dylib -dylib
 
@@ -14,6 +15,15 @@
 # RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_no_merge.dylib | FileCheck %s --check-prefixes=NO_MERGE_CATS
 # RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_merge.dylib | FileCheck %s --check-prefixes=MERGE_CATS
 
+############ Test merging multiple categories into the base class ############
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o merge_base_class_minimal.o merge_base_class_minimal.s
+# RUN: %lld -arch arm64 -dylib -o merge_base_class_minimal_yes_merge.dylib -objc_category_merging merge_base_class_minimal.o merge_cat_minimal.o
+# RUN: %lld -arch arm64 -dylib -o merge_base_class_minimal_no_merge.dylib merge_base_class_minimal.o merge_cat_minimal.o
+
+# RUN: llvm-objdump --objc-meta-data --macho merge_base_class_minimal_no_merge.dylib  | FileCheck %s --check-prefixes=NO_MERGE_INTO_BASE
+# RUN: llvm-objdump --objc-meta-data --macho merge_base_class_minimal_yes_merge.dylib | FileCheck %s --check-prefixes=YES_MERGE_INTO_BASE
+
+
 #### Check merge categories enabled ###
 # Check that the original categories are not there
 MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
@@ -44,6 +54,28 @@ NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
 NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
 
 
+#### Check merge categories into base class is disabled ####
+NO_MERGE_INTO_BASE: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
+NO_MERGE_INTO_BASE: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
+
+#### Check merge categories into base class is enabled and categories are merged into base class ####
+YES_MERGE_INTO_BASE-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
+YES_MERGE_INTO_BASE-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
+
+YES_MERGE_INTO_BASE: _OBJC_CLASS_$_MyBaseClass
+YES_MERGE_INTO_BASE-NEXT: _OBJC_METACLASS_$_MyBaseClass
+YES_MERGE_INTO_BASE: baseMethods
+YES_MERGE_INTO_BASE-NEXT: entsize 24
+YES_MERGE_INTO_BASE-NEXT: count 3
+YES_MERGE_INTO_BASE-NEXT: name {{.*}} cat01_InstanceMethod
+YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8
+YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass(Category01) cat01_InstanceMethod]
+YES_MERGE_INTO_BASE-NEXT: name {{.*}} cat02_InstanceMethod
+YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8
+YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass(Category02) cat02_InstanceMethod]
+YES_MERGE_INTO_BASE-NEXT: name {{.*}} baseInstanceMethod
+YES_MERGE_INTO_BASE-NEXT: types {{.*}} v16@0:8
+YES_MERGE_INTO_BASE-NEXT: imp -[MyBaseClass baseInstanceMethod]
 
 #--- a64_fakedylib.s
 
@@ -156,3 +188,94 @@ L_OBJC_IMAGE_INFO:
 
 .addrsig
 .addrsig_sym __OBJC_$_CATEGORY_MyBaseClass_$_Category01
+
+#--- merge_base_class_minimal.s
+; clang -c merge_base_class_minimal.mm -O3 -target arm64-apple-macos -arch arm64 -S -o merge_base_class_minimal.s
+;  ================== Generated from ObjC: ==================
+; __attribute__((objc_root_class))
+; @interface MyBaseClass
+; - (void)baseInstanceMethod;
+; @end
+;
+; @implementation MyBaseClass
+; - (void)baseInstanceMethod {}
+; @end
+;  ================== Generated from ObjC  ==================
+	.section	__TEXT,__text,regular,pure_instructions
+	.build_version macos, 11, 0
+	.p2align	2
+"-[MyBaseClass baseInstanceMethod]":
+	.cfi_startproc
+; %bb.0:
+	ret
+	.cfi_endproc
+	.section	__DATA,__objc_data
+	.globl	_OBJC_CLASS_$_MyBaseClass
+	.p2align	3, 0x0
+_OBJC_CLASS_$_MyBaseClass:
+	.quad	_OBJC_METACLASS_$_MyBaseClass
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	__OBJC_CLASS_RO_$_MyBaseClass
+	.globl	_OBJC_METACLASS_$_MyBaseClass
+	.p2align	3, 0x0
+_OBJC_METACLASS_$_MyBaseClass:
+	.quad	_OBJC_METACLASS_$_MyBaseClass
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	0
+	.quad	0
+	.quad	__OBJC_METACLASS_RO_$_MyBaseClass
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_:
+	.asciz	"MyBaseClass"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0
+__OBJC_METACLASS_RO_$_MyBaseClass:
+	.long	3
+	.long	40
+	.long	40
+	.space	4
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_:
+	.asciz	"baseInstanceMethod"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_:
+	.asciz	"v16@0:8"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0
+__OBJC_$_INSTANCE_METHODS_MyBaseClass:
+	.long	24
+	.long	1
+	.quad	l_OBJC_METH_VAR_NAME_
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass baseInstanceMethod]"
+	.p2align	3, 0x0
+__OBJC_CLASS_RO_$_MyBaseClass:
+	.long	2
+	.long	0
+	.long	0
+	.space	4
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_
+	.quad	__OBJC_$_INSTANCE_METHODS_MyBaseClass
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	0
+	.section	__DATA,__objc_classlist,regular,no_dead_strip
+	.p2align	3, 0x0
+l_OBJC_LABEL_CLASS_$:
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.section	__DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+	.long	0
+	.long	64
+.subsections_via_symbols
diff --git a/lld/wasm/WriterUtils.cpp b/lld/wasm/WriterUtils.cpp
index cdd2c42f939ef..c6a1592012e64 100644
--- a/lld/wasm/WriterUtils.cpp
+++ b/lld/wasm/WriterUtils.cpp
@@ -35,6 +35,8 @@ std::string toString(ValType type) {
     return "funcref";
   case ValType::EXTERNREF:
     return "externref";
+  case ValType::EXNREF:
+    return "exnref";
   case ValType::OTHERREF:
     return "otherref";
   }
diff --git a/lldb/bindings/headers.swig b/lldb/bindings/headers.swig
index ffdc3c31ec883..c91504604b6ac 100644
--- a/lldb/bindings/headers.swig
+++ b/lldb/bindings/headers.swig
@@ -8,6 +8,8 @@
 %{
 #include "lldb/lldb-public.h"
 #include "lldb/API/SBAddress.h"
+#include "lldb/API/SBAddressRange.h"
+#include "lldb/API/SBAddressRangeList.h"
 #include "lldb/API/SBAttachInfo.h"
 #include "lldb/API/SBBlock.h"
 #include "lldb/API/SBBreakpoint.h"
diff --git a/lldb/bindings/interface/SBAddressRangeDocstrings.i b/lldb/bindings/interface/SBAddressRangeDocstrings.i
new file mode 100644
index 0000000000000..650195704d73e
--- /dev/null
+++ b/lldb/bindings/interface/SBAddressRangeDocstrings.i
@@ -0,0 +1,3 @@
+%feature("docstring",
+"API clients can get address range information."
+) lldb::SBAddressRange;
diff --git a/lldb/bindings/interface/SBAddressRangeExtensions.i b/lldb/bindings/interface/SBAddressRangeExtensions.i
new file mode 100644
index 0000000000000..31bcfcb64590b
--- /dev/null
+++ b/lldb/bindings/interface/SBAddressRangeExtensions.i
@@ -0,0 +1,11 @@
+%extend lldb::SBAddressRange {
+#ifdef SWIGPYTHON
+    %pythoncode%{
+      def __repr__(self):
+        import lldb
+        stream = lldb.SBStream()
+        self.GetDescription(stream, lldb.target if lldb.target else lldb.SBTarget())
+        return stream.GetData()
+    %}
+#endif
+}
diff --git a/lldb/bindings/interface/SBAddressRangeListDocstrings.i b/lldb/bindings/interface/SBAddressRangeListDocstrings.i
new file mode 100644
index 0000000000000..e4b96b9ca5931
--- /dev/null
+++ b/lldb/bindings/interface/SBAddressRangeListDocstrings.i
@@ -0,0 +1,3 @@
+%feature("docstring",
+"Represents a list of :py:class:`SBAddressRange`."
+) lldb::SBAddressRangeList;
diff --git a/lldb/bindings/interface/SBAddressRangeListExtensions.i b/lldb/bindings/interface/SBAddressRangeListExtensions.i
new file mode 100644
index 0000000000000..e281a84d73d27
--- /dev/null
+++ b/lldb/bindings/interface/SBAddressRangeListExtensions.i
@@ -0,0 +1,29 @@
+%extend lldb::SBAddressRangeList {
+#ifdef SWIGPYTHON
+    %pythoncode%{
+    def __len__(self):
+      '''Return the number of address ranges in a lldb.SBAddressRangeList object.'''
+      return self.GetSize()
+
+    def __iter__(self):
+      '''Iterate over all the address ranges in a lldb.SBAddressRangeList object.'''
+      return lldb_iter(self, 'GetSize', 'GetAddressRangeAtIndex')
+
+    def __getitem__(self, idx):
+      '''Get the address range at a given index in an lldb.SBAddressRangeList object.'''
+      if not isinstance(idx, int):
+        raise TypeError("unsupported index type: %s" % type(idx))
+      count = len(self)
+      if not (-count <= idx < count):
+        raise IndexError("list index out of range")
+      idx %= count
+      return self.GetAddressRangeAtIndex(idx)
+
+    def __repr__(self):
+      import lldb
+      stream = lldb.SBStream()
+      self.GetDescription(stream, lldb.target if lldb.target else lldb.SBTarget())
+      return stream.GetData()
+    %}
+#endif
+}
diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig
index 2a29a8dd7ef0b..0953f4c72a910 100644
--- a/lldb/bindings/interfaces.swig
+++ b/lldb/bindings/interfaces.swig
@@ -12,6 +12,8 @@
 
 /* Docstrings for SB classes and methods */
 %include "./interface/SBAddressDocstrings.i"
+%include "./interface/SBAddressRangeDocstrings.i"
+%include "./interface/SBAddressRangeListDocstrings.i"
 %include "./interface/SBAttachInfoDocstrings.i"
 %include "./interface/SBBlockDocstrings.i"
 %include "./interface/SBBreakpointDocstrings.i"
@@ -86,6 +88,8 @@
 
 /* API headers */
 %include "lldb/API/SBAddress.h"
+%include "lldb/API/SBAddressRange.h"
+%include "lldb/API/SBAddressRangeList.h"
 %include "lldb/API/SBAttachInfo.h"
 %include "lldb/API/SBBlock.h"
 %include "lldb/API/SBBreakpoint.h"
@@ -163,6 +167,8 @@
 
 /* Extensions for SB classes */
 %include "./interface/SBAddressExtensions.i"
+%include "./interface/SBAddressRangeExtensions.i"
+%include "./interface/SBAddressRangeListExtensions.i"
 %include "./interface/SBBlockExtensions.i"
 %include "./interface/SBBreakpointExtensions.i"
 %include "./interface/SBBreakpointListExtensions.i"
diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h
index b256544326a22..d8cc9f5067fe9 100644
--- a/lldb/include/lldb/API/LLDB.h
+++ b/lldb/include/lldb/API/LLDB.h
@@ -10,6 +10,8 @@
 #define LLDB_API_LLDB_H
 
 #include "lldb/API/SBAddress.h"
+#include "lldb/API/SBAddressRange.h"
+#include "lldb/API/SBAddressRangeList.h"
 #include "lldb/API/SBAttachInfo.h"
 #include "lldb/API/SBBlock.h"
 #include "lldb/API/SBBreakpoint.h"
diff --git a/lldb/include/lldb/API/SBAddress.h b/lldb/include/lldb/API/SBAddress.h
index 5e5f355ccc390..430dad4862dbf 100644
--- a/lldb/include/lldb/API/SBAddress.h
+++ b/lldb/include/lldb/API/SBAddress.h
@@ -86,6 +86,7 @@ class LLDB_API SBAddress {
   lldb::SBLineEntry GetLineEntry();
 
 protected:
+  friend class SBAddressRange;
   friend class SBBlock;
   friend class SBBreakpoint;
   friend class SBBreakpointLocation;
diff --git a/lldb/include/lldb/API/SBAddressRange.h b/lldb/include/lldb/API/SBAddressRange.h
new file mode 100644
index 0000000000000..152bd82426af1
--- /dev/null
+++ b/lldb/include/lldb/API/SBAddressRange.h
@@ -0,0 +1,66 @@
+//===-- SBAddressRange.h ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_API_SBADDRESSRANGE_H
+#define LLDB_API_SBADDRESSRANGE_H
+
+#include "lldb/API/SBDefines.h"
+
+namespace lldb {
+
+class LLDB_API SBAddressRange {
+public:
+  SBAddressRange();
+
+  SBAddressRange(const lldb::SBAddressRange &rhs);
+
+  SBAddressRange(lldb::SBAddress addr, lldb::addr_t byte_size);
+
+  ~SBAddressRange();
+
+  const lldb::SBAddressRange &operator=(const lldb::SBAddressRange &rhs);
+
+  void Clear();
+
+  /// Check the address range refers to a valid base address and has a byte
+  /// size greater than zero.
+  ///
+  /// \return
+  ///     True if the address range is valid, false otherwise.
+  bool IsValid() const;
+
+  /// Get the base address of the range.
+  ///
+  /// \return
+  ///     Base address object.
+  lldb::SBAddress GetBaseAddress() const;
+
+  /// Get the byte size of this range.
+  ///
+  /// \return
+  ///     The size in bytes of this address range.
+  lldb::addr_t GetByteSize() const;
+
+  bool operator==(const SBAddressRange &rhs);
+
+  bool operator!=(const SBAddressRange &rhs);
+
+  bool GetDescription(lldb::SBStream &description, const SBTarget target);
+
+private:
+  friend class SBAddressRangeList;
+  friend class SBBlock;
+  friend class SBFunction;
+  friend class SBProcess;
+
+  AddressRangeUP m_opaque_up;
+};
+
+} // namespace lldb
+
+#endif // LLDB_API_SBADDRESSRANGE_H
diff --git a/lldb/include/lldb/API/SBAddressRangeList.h b/lldb/include/lldb/API/SBAddressRangeList.h
new file mode 100644
index 0000000000000..a123287ef1b4f
--- /dev/null
+++ b/lldb/include/lldb/API/SBAddressRangeList.h
@@ -0,0 +1,54 @@
+//===-- SBAddressRangeList.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_API_SBADDRESSRANGELIST_H
+#define LLDB_API_SBADDRESSRANGELIST_H
+
+#include <memory>
+
+#include "lldb/API/SBDefines.h"
+
+namespace lldb_private {
+class AddressRangeListImpl;
+}
+
+namespace lldb {
+
+class LLDB_API SBAddressRangeList {
+public:
+  SBAddressRangeList();
+
+  SBAddressRangeList(const lldb::SBAddressRangeList &rhs);
+
+  ~SBAddressRangeList();
+
+  const lldb::SBAddressRangeList &
+  operator=(const lldb::SBAddressRangeList &rhs);
+
+  uint32_t GetSize() const;
+
+  void Clear();
+
+  SBAddressRange GetAddressRangeAtIndex(uint64_t idx);
+
+  void Append(const lldb::SBAddressRange &addr_range);
+
+  void Append(const lldb::SBAddressRangeList &addr_range_list);
+
+  bool GetDescription(lldb::SBStream &description, const SBTarget &target);
+
+private:
+  friend class SBBlock;
+  friend class SBProcess;
+
+  std::unique_ptr<lldb_private::AddressRangeListImpl> m_opaque_up;
+};
+
+} // namespace lldb
+
+#endif // LLDB_API_SBADDRESSRANGELIST_H
diff --git a/lldb/include/lldb/API/SBBlock.h b/lldb/include/lldb/API/SBBlock.h
index 2570099f7652f..de4bb22be2692 100644
--- a/lldb/include/lldb/API/SBBlock.h
+++ b/lldb/include/lldb/API/SBBlock.h
@@ -9,6 +9,8 @@
 #ifndef LLDB_API_SBBLOCK_H
 #define LLDB_API_SBBLOCK_H
 
+#include "lldb/API/SBAddressRange.h"
+#include "lldb/API/SBAddressRangeList.h"
 #include "lldb/API/SBDefines.h"
 #include "lldb/API/SBFrame.h"
 #include "lldb/API/SBTarget.h"
@@ -52,6 +54,8 @@ class LLDB_API SBBlock {
 
   lldb::SBAddress GetRangeEndAddress(uint32_t idx);
 
+  lldb::SBAddressRangeList GetRanges();
+
   uint32_t GetRangeIndexForBlockAddress(lldb::SBAddress block_addr);
 
   lldb::SBValueList GetVariables(lldb::SBFrame &frame, bool arguments,
diff --git a/lldb/include/lldb/API/SBDefines.h b/lldb/include/lldb/API/SBDefines.h
index 1181920677b46..87c0a1c3661ca 100644
--- a/lldb/include/lldb/API/SBDefines.h
+++ b/lldb/include/lldb/API/SBDefines.h
@@ -43,6 +43,8 @@
 namespace lldb {
 
 class LLDB_API SBAddress;
+class LLDB_API SBAddressRange;
+class LLDB_API SBAddressRangeList;
 class LLDB_API SBAttachInfo;
 class LLDB_API SBBlock;
 class LLDB_API SBBreakpoint;
diff --git a/lldb/include/lldb/API/SBFunction.h b/lldb/include/lldb/API/SBFunction.h
index 71b372a818e4b..df607fdc7ebf5 100644
--- a/lldb/include/lldb/API/SBFunction.h
+++ b/lldb/include/lldb/API/SBFunction.h
@@ -10,6 +10,7 @@
 #define LLDB_API_SBFUNCTION_H
 
 #include "lldb/API/SBAddress.h"
+#include "lldb/API/SBAddressRangeList.h"
 #include "lldb/API/SBDefines.h"
 #include "lldb/API/SBInstructionList.h"
 
@@ -44,6 +45,8 @@ class LLDB_API SBFunction {
 
   lldb::SBAddress GetEndAddress();
 
+  lldb::SBAddressRangeList GetRanges();
+
   const char *GetArgumentName(uint32_t arg_idx);
 
   uint32_t GetPrologueByteSize();
diff --git a/lldb/include/lldb/API/SBStream.h b/lldb/include/lldb/API/SBStream.h
index 0e33f05b69916..71caf41fd7549 100644
--- a/lldb/include/lldb/API/SBStream.h
+++ b/lldb/include/lldb/API/SBStream.h
@@ -62,6 +62,8 @@ class LLDB_API SBStream {
 
 protected:
   friend class SBAddress;
+  friend class SBAddressRange;
+  friend class SBAddressRangeList;
   friend class SBBlock;
   friend class SBBreakpoint;
   friend class SBBreakpointLocation;
diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h
index feeaa1cb71132..35c2ed9c20a23 100644
--- a/lldb/include/lldb/API/SBTarget.h
+++ b/lldb/include/lldb/API/SBTarget.h
@@ -943,6 +943,7 @@ class LLDB_API SBTarget {
 
 protected:
   friend class SBAddress;
+  friend class SBAddressRange;
   friend class SBBlock;
   friend class SBBreakpoint;
   friend class SBBreakpointList;
diff --git a/lldb/include/lldb/Core/AddressRange.h b/lldb/include/lldb/Core/AddressRange.h
index 4a33c2d795876..68a3ad0edd2d7 100644
--- a/lldb/include/lldb/Core/AddressRange.h
+++ b/lldb/include/lldb/Core/AddressRange.h
@@ -86,6 +86,8 @@ class AddressRange {
   /// (LLDB_INVALID_ADDRESS) and a zero byte size.
   void Clear();
 
+  bool IsValid() const;
+
   /// Check if a section offset address is contained in this range.
   ///
   /// \param[in] so_addr
@@ -236,12 +238,24 @@ class AddressRange {
   ///     The new size in bytes of this address range.
   void SetByteSize(lldb::addr_t byte_size) { m_byte_size = byte_size; }
 
+  bool GetDescription(Stream *s, Target *target) const;
+
+  bool operator==(const AddressRange &rhs);
+
+  bool operator!=(const AddressRange &rhs);
+
 protected:
   // Member variables
   Address m_base_addr;      ///< The section offset base address of this range.
   lldb::addr_t m_byte_size = 0; ///< The size in bytes of this address range.
 };
 
+// Forward-declarable wrapper.
+class AddressRanges : public std::vector<lldb_private::AddressRange> {
+public:
+  using std::vector<lldb_private::AddressRange>::vector;
+};
+
 } // namespace lldb_private
 
 #endif // LLDB_CORE_ADDRESSRANGE_H
diff --git a/lldb/include/lldb/Core/AddressRangeListImpl.h b/lldb/include/lldb/Core/AddressRangeListImpl.h
new file mode 100644
index 0000000000000..46ebfe73d4d92
--- /dev/null
+++ b/lldb/include/lldb/Core/AddressRangeListImpl.h
@@ -0,0 +1,51 @@
+//===-- AddressRangeListImpl.h ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_CORE_ADDRESSRANGELISTIMPL_H
+#define LLDB_CORE_ADDRESSRANGELISTIMPL_H
+
+#include "lldb/Core/AddressRange.h"
+#include <cstddef>
+
+namespace lldb {
+class SBBlock;
+}
+
+namespace lldb_private {
+
+class AddressRangeListImpl {
+public:
+  AddressRangeListImpl();
+
+  AddressRangeListImpl(const AddressRangeListImpl &rhs) = default;
+
+  AddressRangeListImpl &operator=(const AddressRangeListImpl &rhs);
+
+  size_t GetSize() const;
+
+  void Reserve(size_t capacity);
+
+  void Append(const AddressRange &sb_region);
+
+  void Append(const AddressRangeListImpl &list);
+
+  void Clear();
+
+  lldb_private::AddressRange GetAddressRangeAtIndex(size_t index);
+
+private:
+  friend class lldb::SBBlock;
+
+  AddressRanges &ref();
+
+  AddressRanges m_ranges;
+};
+
+} // namespace lldb_private
+
+#endif // LLDB_CORE_ADDRESSRANGELISTIMPL_H
diff --git a/lldb/include/lldb/Symbol/Block.h b/lldb/include/lldb/Symbol/Block.h
index 02fd2add53103..c9c4d5ad767d7 100644
--- a/lldb/include/lldb/Symbol/Block.h
+++ b/lldb/include/lldb/Symbol/Block.h
@@ -355,6 +355,8 @@ class Block : public UserID, public SymbolContextScope {
   // be able to get at any of the address ranges in a block.
   bool GetRangeAtIndex(uint32_t range_idx, AddressRange &range);
 
+  AddressRanges GetRanges();
+
   bool GetStartAddress(Address &addr);
 
   void SetDidParseVariables(bool b, bool set_children);
diff --git a/lldb/include/lldb/Symbol/Type.h b/lldb/include/lldb/Symbol/Type.h
index 7aa0852676e46..c6f30cde81867 100644
--- a/lldb/include/lldb/Symbol/Type.h
+++ b/lldb/include/lldb/Symbol/Type.h
@@ -62,6 +62,8 @@ struct CompilerContext {
   CompilerContextKind kind;
   ConstString name;
 };
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+                              const CompilerContext &rhs);
 
 /// Match \p context_chain against \p pattern, which may contain "Any"
 /// kinds. The \p context_chain should *not* contain any "Any" kinds.
diff --git a/lldb/include/lldb/Target/RegisterFlags.h b/lldb/include/lldb/Target/RegisterFlags.h
index 9b343e445678a..29a47540cd4f5 100644
--- a/lldb/include/lldb/Target/RegisterFlags.h
+++ b/lldb/include/lldb/Target/RegisterFlags.h
@@ -15,7 +15,7 @@
 
 namespace lldb_private {
 
-class StreamString;
+class Stream;
 class Log;
 
 class RegisterFlags {
@@ -56,7 +56,7 @@ class RegisterFlags {
     /// Output XML that describes this field, to be inserted into a target XML
     /// file. Reserved characters in field names like "<" are replaced with
     /// their XML safe equivalents like "&gt;".
-    void ToXML(StreamString &strm) const;
+    void ToXML(Stream &strm) const;
 
     bool operator<(const Field &rhs) const {
       return GetStart() < rhs.GetStart();
@@ -119,7 +119,7 @@ class RegisterFlags {
   std::string AsTable(uint32_t max_width) const;
 
   // Output XML that describes this set of flags.
-  void ToXML(StreamString &strm) const;
+  void ToXML(Stream &strm) const;
 
 private:
   const std::string m_id;
diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h
index 10ba921b9dac8..6d880b4da03c9 100644
--- a/lldb/include/lldb/lldb-forward.h
+++ b/lldb/include/lldb/lldb-forward.h
@@ -19,6 +19,8 @@ class ASTResultSynthesizer;
 class ASTStructExtractor;
 class Address;
 class AddressRange;
+class AddressRanges;
+class AddressRangeList;
 class AddressResolver;
 class ArchSpec;
 class Architecture;
@@ -308,6 +310,7 @@ template <unsigned N> class StreamBuffer;
 namespace lldb {
 
 typedef std::shared_ptr<lldb_private::ABI> ABISP;
+typedef std::unique_ptr<lldb_private::AddressRange> AddressRangeUP;
 typedef std::shared_ptr<lldb_private::Baton> BatonSP;
 typedef std::shared_ptr<lldb_private::Block> BlockSP;
 typedef std::shared_ptr<lldb_private::Breakpoint> BreakpointSP;
diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt
index e8228afe103f9..6397101609315 100644
--- a/lldb/source/API/CMakeLists.txt
+++ b/lldb/source/API/CMakeLists.txt
@@ -42,6 +42,8 @@ set_target_properties(lldb-sbapi-dwarf-enums PROPERTIES FOLDER "LLDB/Tablegennin
 
 add_lldb_library(liblldb SHARED ${option_framework}
   SBAddress.cpp
+  SBAddressRange.cpp
+  SBAddressRangeList.cpp
   SBAttachInfo.cpp
   SBBlock.cpp
   SBBreakpoint.cpp
diff --git a/lldb/source/API/SBAddressRange.cpp b/lldb/source/API/SBAddressRange.cpp
new file mode 100644
index 0000000000000..9b1affdade439
--- /dev/null
+++ b/lldb/source/API/SBAddressRange.cpp
@@ -0,0 +1,103 @@
+//===-- SBAddressRange.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/API/SBAddressRange.h"
+#include "Utils.h"
+#include "lldb/API/SBAddress.h"
+#include "lldb/API/SBStream.h"
+#include "lldb/API/SBTarget.h"
+#include "lldb/Core/AddressRange.h"
+#include "lldb/Core/Section.h"
+#include "lldb/Utility/Instrumentation.h"
+#include "lldb/Utility/Stream.h"
+#include <cstddef>
+#include <memory>
+
+using namespace lldb;
+using namespace lldb_private;
+
+SBAddressRange::SBAddressRange()
+    : m_opaque_up(std::make_unique<AddressRange>()) {
+  LLDB_INSTRUMENT_VA(this);
+}
+
+SBAddressRange::SBAddressRange(const SBAddressRange &rhs) {
+  LLDB_INSTRUMENT_VA(this, rhs);
+
+  m_opaque_up = clone(rhs.m_opaque_up);
+}
+
+SBAddressRange::SBAddressRange(lldb::SBAddress addr, lldb::addr_t byte_size)
+    : m_opaque_up(std::make_unique<AddressRange>(addr.ref(), byte_size)) {
+  LLDB_INSTRUMENT_VA(this, addr, byte_size);
+}
+
+SBAddressRange::~SBAddressRange() = default;
+
+const SBAddressRange &SBAddressRange::operator=(const SBAddressRange &rhs) {
+  LLDB_INSTRUMENT_VA(this, rhs);
+
+  if (this != &rhs)
+    m_opaque_up = clone(rhs.m_opaque_up);
+  return *this;
+}
+
+bool SBAddressRange::operator==(const SBAddressRange &rhs) {
+  LLDB_INSTRUMENT_VA(this, rhs);
+
+  if (!IsValid() || !rhs.IsValid())
+    return false;
+  return m_opaque_up->operator==(*(rhs.m_opaque_up));
+}
+
+bool SBAddressRange::operator!=(const SBAddressRange &rhs) {
+  LLDB_INSTRUMENT_VA(this, rhs);
+
+  return !(*this == rhs);
+}
+
+void SBAddressRange::Clear() {
+  LLDB_INSTRUMENT_VA(this);
+  // Keep the pointer non-null; SBAddressRangeList::Append() dereferences it.
+  m_opaque_up->Clear();
+}
+
+bool SBAddressRange::IsValid() const {
+  LLDB_INSTRUMENT_VA(this);
+
+  return m_opaque_up && m_opaque_up->IsValid();
+}
+
+lldb::SBAddress SBAddressRange::GetBaseAddress() const {
+  LLDB_INSTRUMENT_VA(this);
+
+  if (!IsValid())
+    return lldb::SBAddress();
+  return lldb::SBAddress(m_opaque_up->GetBaseAddress());
+}
+
+lldb::addr_t SBAddressRange::GetByteSize() const {
+  LLDB_INSTRUMENT_VA(this);
+
+  if (!IsValid())
+    return 0;
+  return m_opaque_up->GetByteSize();
+}
+
+bool SBAddressRange::GetDescription(SBStream &description,
+                                    const SBTarget target) {
+  LLDB_INSTRUMENT_VA(this, description, target);
+
+  Stream &stream = description.ref();
+  if (!IsValid()) {
+    stream << "<invalid>";
+    return true;
+  }
+  m_opaque_up->GetDescription(&stream, target.GetSP().get());
+  return true;
+}
diff --git a/lldb/source/API/SBAddressRangeList.cpp b/lldb/source/API/SBAddressRangeList.cpp
new file mode 100644
index 0000000000000..20660b3ff2088
--- /dev/null
+++ b/lldb/source/API/SBAddressRangeList.cpp
@@ -0,0 +1,94 @@
+//===-- SBAddressRangeList.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/API/SBAddressRangeList.h"
+#include "Utils.h"
+#include "lldb/API/SBAddressRange.h"
+#include "lldb/API/SBStream.h"
+#include "lldb/API/SBTarget.h"
+#include "lldb/Core/AddressRangeListImpl.h"
+#include "lldb/Utility/Instrumentation.h"
+#include "lldb/Utility/Stream.h"
+
+#include <memory>
+
+using namespace lldb;
+using namespace lldb_private;
+
+SBAddressRangeList::SBAddressRangeList()
+    : m_opaque_up(std::make_unique<AddressRangeListImpl>()) {
+  LLDB_INSTRUMENT_VA(this);
+}
+
+SBAddressRangeList::SBAddressRangeList(const SBAddressRangeList &rhs)
+    : m_opaque_up(std::make_unique<AddressRangeListImpl>(*rhs.m_opaque_up)) {
+  LLDB_INSTRUMENT_VA(this, rhs);
+}
+
+SBAddressRangeList::~SBAddressRangeList() = default;
+
+const SBAddressRangeList &
+SBAddressRangeList::operator=(const SBAddressRangeList &rhs) {
+  LLDB_INSTRUMENT_VA(this, rhs);
+
+  if (this != &rhs)
+    *m_opaque_up = *rhs.m_opaque_up;
+  return *this;
+}
+
+uint32_t SBAddressRangeList::GetSize() const {
+  LLDB_INSTRUMENT_VA(this);
+
+  return m_opaque_up->GetSize();
+}
+
+SBAddressRange SBAddressRangeList::GetAddressRangeAtIndex(uint64_t idx) {
+  LLDB_INSTRUMENT_VA(this, idx);
+
+  SBAddressRange sb_addr_range;
+  (*sb_addr_range.m_opaque_up) = m_opaque_up->GetAddressRangeAtIndex(idx);
+  return sb_addr_range;
+}
+
+void SBAddressRangeList::Clear() {
+  LLDB_INSTRUMENT_VA(this);
+
+  m_opaque_up->Clear();
+}
+
+void SBAddressRangeList::Append(const SBAddressRange &sb_addr_range) {
+  LLDB_INSTRUMENT_VA(this, sb_addr_range);
+
+  m_opaque_up->Append(*sb_addr_range.m_opaque_up);
+}
+
+void SBAddressRangeList::Append(const SBAddressRangeList &sb_addr_range_list) {
+  LLDB_INSTRUMENT_VA(this, sb_addr_range_list);
+
+  m_opaque_up->Append(*sb_addr_range_list.m_opaque_up);
+}
+
+// Writes the list to \p description as "[<range>, <range>, ...]", rendering
+// each element with SBAddressRange::GetDescription(). Always returns true.
+bool SBAddressRangeList::GetDescription(SBStream &description,
+                                        const SBTarget &target) {
+  LLDB_INSTRUMENT_VA(this, description, target);
+
+  const uint32_t count = GetSize();
+  Stream &out = description.ref();
+
+  out << "[";
+  for (uint32_t idx = 0; idx < count; ++idx) {
+    // Every element after the first is preceded by a separator.
+    if (idx != 0)
+      out.Printf(", ");
+    GetAddressRangeAtIndex(idx).GetDescription(description, target);
+  }
+  out << "]";
+  return true;
+}
diff --git a/lldb/source/API/SBBlock.cpp b/lldb/source/API/SBBlock.cpp
index 7d7565340836b..2577b14920f06 100644
--- a/lldb/source/API/SBBlock.cpp
+++ b/lldb/source/API/SBBlock.cpp
@@ -13,6 +13,7 @@
 #include "lldb/API/SBStream.h"
 #include "lldb/API/SBValue.h"
 #include "lldb/Core/AddressRange.h"
+#include "lldb/Core/AddressRangeListImpl.h"
 #include "lldb/Core/ValueObjectVariable.h"
 #include "lldb/Symbol/Block.h"
 #include "lldb/Symbol/Function.h"
@@ -219,6 +220,15 @@ lldb::SBAddress SBBlock::GetRangeEndAddress(uint32_t idx) {
   return sb_addr;
 }
 
+lldb::SBAddressRangeList SBBlock::GetRanges() {
+  LLDB_INSTRUMENT_VA(this);
+
+  lldb::SBAddressRangeList sb_ranges;
+  if (m_opaque_ptr)
+    sb_ranges.m_opaque_up->ref() = m_opaque_ptr->GetRanges();
+  return sb_ranges;
+}
+
 uint32_t SBBlock::GetRangeIndexForBlockAddress(lldb::SBAddress block_addr) {
   LLDB_INSTRUMENT_VA(this, block_addr);
 
diff --git a/lldb/source/API/SBFunction.cpp b/lldb/source/API/SBFunction.cpp
index a01c7f79bbd31..6a97352fc2c2f 100644
--- a/lldb/source/API/SBFunction.cpp
+++ b/lldb/source/API/SBFunction.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/API/SBFunction.h"
+#include "lldb/API/SBAddressRange.h"
 #include "lldb/API/SBProcess.h"
 #include "lldb/API/SBStream.h"
 #include "lldb/Core/Disassembler.h"
@@ -160,6 +161,19 @@ SBAddress SBFunction::GetEndAddress() {
   return addr;
 }
 
+// Returns the function's address range as a list (empty if no function).
+lldb::SBAddressRangeList SBFunction::GetRanges() {
+  LLDB_INSTRUMENT_VA(this);
+
+  lldb::SBAddressRangeList ranges;
+  if (m_opaque_ptr) {
+    lldb::SBAddressRange range;
+    (*range.m_opaque_up) = m_opaque_ptr->GetAddressRange();
+    ranges.Append(range); // Append() copies from a const reference.
+  }
+  return ranges;
+}
+
 const char *SBFunction::GetArgumentName(uint32_t arg_idx) {
   LLDB_INSTRUMENT_VA(this, arg_idx);
 
diff --git a/lldb/source/Core/AddressRange.cpp b/lldb/source/Core/AddressRange.cpp
index 1830f2ccd47fe..6cef7e149cd20 100644
--- a/lldb/source/Core/AddressRange.cpp
+++ b/lldb/source/Core/AddressRange.cpp
@@ -14,6 +14,7 @@
 #include "lldb/Utility/FileSpec.h"
 #include "lldb/Utility/Stream.h"
 #include "lldb/lldb-defines.h"
+#include "lldb/lldb-types.h"
 
 #include "llvm/Support/Compiler.h"
 
@@ -145,6 +146,10 @@ void AddressRange::Clear() {
   m_byte_size = 0;
 }
 
+bool AddressRange::IsValid() const {
+  return m_base_addr.IsValid() && (m_byte_size > 0);
+}
+
 bool AddressRange::Dump(Stream *s, Target *target, Address::DumpStyle style,
                         Address::DumpStyle fallback_style) const {
   addr_t vmaddr = LLDB_INVALID_ADDRESS;
@@ -203,3 +208,41 @@ void AddressRange::DumpDebug(Stream *s) const {
             static_cast<void *>(m_base_addr.GetSection().get()),
             m_base_addr.GetOffset(), GetByteSize());
 }
+
+// Print a one-line description of this range to \p s. Always returns true.
+bool AddressRange::GetDescription(Stream *s, Target *target) const {
+  addr_t start_addr = m_base_addr.GetLoadAddress(target);
+  if (start_addr != LLDB_INVALID_ADDRESS) {
+    // We have a valid target and the address was resolved, or we have a base
+    // address with no section. Print a raw address range: [<addr>-<addr>)
+    s->Printf("[0x%" PRIx64 "-0x%" PRIx64 ")", start_addr,
+              start_addr + GetByteSize());
+    return true;
+  }
+  // Either no target or the address wasn't resolved, print as
+  // <module>[<file-addr>-<file-addr>)
+  const char *file_name = "";
+  const auto section_sp = m_base_addr.GetSection();
+  if (section_sp) {
+    // Pass a fallback: AsCString() returns nullptr for an empty name.
+    if (const auto object_file = section_sp->GetObjectFile())
+      file_name = object_file->GetFileSpec().GetFilename().AsCString("");
+  }
+  start_addr = m_base_addr.GetFileAddress();
+  const addr_t end_addr = (start_addr == LLDB_INVALID_ADDRESS)
+                              ? LLDB_INVALID_ADDRESS
+                              : start_addr + GetByteSize();
+  s->Printf("%s[0x%" PRIx64 "-0x%" PRIx64 ")", file_name, start_addr, end_addr);
+  return true;
+}
+
+bool AddressRange::operator==(const AddressRange &rhs) {
+  if (!IsValid() || !rhs.IsValid())
+    return false;
+  return m_base_addr == rhs.GetBaseAddress() &&
+         m_byte_size == rhs.GetByteSize();
+}
+
+bool AddressRange::operator!=(const AddressRange &rhs) {
+  return !(*this == rhs);
+}
diff --git a/lldb/source/Core/AddressRangeListImpl.cpp b/lldb/source/Core/AddressRangeListImpl.cpp
new file mode 100644
index 0000000000000..d405cf0fa3ec3
--- /dev/null
+++ b/lldb/source/Core/AddressRangeListImpl.cpp
@@ -0,0 +1,50 @@
+//===-- AddressRangeListImpl.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Core/AddressRangeListImpl.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+AddressRangeListImpl::AddressRangeListImpl() : m_ranges() {}
+
+AddressRangeListImpl &
+AddressRangeListImpl::operator=(const AddressRangeListImpl &rhs) {
+  if (this == &rhs)
+    return *this;
+  m_ranges = rhs.m_ranges;
+  return *this;
+}
+
+size_t AddressRangeListImpl::GetSize() const { return m_ranges.size(); }
+
+void AddressRangeListImpl::Reserve(size_t capacity) {
+  m_ranges.reserve(capacity);
+}
+
+void AddressRangeListImpl::Append(const AddressRange &sb_region) {
+  m_ranges.emplace_back(sb_region);
+}
+
+void AddressRangeListImpl::Append(const AddressRangeListImpl &list) {
+  Reserve(GetSize() + list.GetSize());
+
+  for (const auto &range : list.m_ranges)
+    Append(range);
+}
+
+void AddressRangeListImpl::Clear() { m_ranges.clear(); }
+
+lldb_private::AddressRange
+AddressRangeListImpl::GetAddressRangeAtIndex(size_t index) {
+  if (index >= GetSize())
+    return AddressRange();
+  return m_ranges[index];
+}
+
+AddressRanges &AddressRangeListImpl::ref() { return m_ranges; }
diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt
index f24dbbd45a8e8..dbc620b91b1ed 100644
--- a/lldb/source/Core/CMakeLists.txt
+++ b/lldb/source/Core/CMakeLists.txt
@@ -20,6 +20,7 @@ endif()
 add_lldb_library(lldbCore
   Address.cpp
   AddressRange.cpp
+  AddressRangeListImpl.cpp
   AddressResolver.cpp
   AddressResolverFileLine.cpp
   Communication.cpp
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp
index ca582cb1d5a46..ddaa7a8a597b4 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCTypeEncodingParser.cpp
@@ -13,6 +13,8 @@
 #include "lldb/Symbol/CompilerType.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Target/Target.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
 #include "lldb/Utility/StringLexer.h"
 
 #include "clang/Basic/TargetInfo.h"
@@ -234,12 +236,15 @@ clang::QualType AppleObjCTypeEncodingParser::BuildObjCObjectPointerType(
 
     auto types = decl_vendor->FindTypes(ConstString(name), /*max_matches*/ 1);
 
-    // The user can forward-declare something that has no definition.  The runtime
-    // doesn't prohibit this at all. This is a rare and very weird case.  We keep
-    // this assert in debug builds so we catch other weird cases.
-    lldbassert(!types.empty());
-    if (types.empty())
+    if (types.empty()) {
+      // The user can forward-declare something that has no definition. The
+      // runtime doesn't prohibit this at all. This is a rare and very weird
+      // case. Assert in debug builds so we catch other weird cases.
+      assert(false && "forward declaration without definition");
+      LLDB_LOG(GetLog(LLDBLog::Types),
+               "forward declaration without definition: {0}", name);
       return ast_ctx.getObjCIdType();
+    }
 
     return ClangUtil::GetQualType(types.front().GetPointerType());
   } else {
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp
index 33537df4f5076..1703597a7cd2f 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/AppleDWARFIndex.cpp
@@ -284,8 +284,12 @@ void AppleDWARFIndex::GetFunctions(
   for (const auto &entry : m_apple_names_up->equal_range(name)) {
     DIERef die_ref(std::nullopt, DIERef::Section::DebugInfo,
                    *entry.getDIESectionOffset());
-    if (!ProcessFunctionDIE(lookup_info, die_ref, dwarf, parent_decl_ctx,
-                            callback))
+    DWARFDIE die = dwarf.GetDIE(die_ref);
+    if (!die) {
+      ReportInvalidDIERef(die_ref, name);
+      continue;
+    }
+    if (!ProcessFunctionDIE(lookup_info, die, parent_decl_ctx, callback))
       return;
   }
 }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h
index 66db396279e06..e144cf0f9bd94 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h
@@ -60,6 +60,8 @@ class DWARFASTParser {
 
   virtual ConstString GetDIEClassTemplateParams(const DWARFDIE &die) = 0;
 
+  virtual lldb_private::Type *FindDefinitionTypeForDIE(const DWARFDIE &die) = 0;
+
   static std::optional<SymbolFile::ArrayInfo>
   ParseChildArrayInfo(const DWARFDIE &parent_die,
                       const ExecutionContext *exe_ctx = nullptr);
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index f8101aba5c627..e0b1b430b266f 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -154,6 +154,26 @@ static bool TagIsRecordType(dw_tag_t tag) {
   }
 }
 
+static bool IsForwardDeclaration(const DWARFDIE &die,
+                                 const ParsedDWARFTypeAttributes &attrs,
+                                 LanguageType cu_language) {
+  if (attrs.is_forward_declaration)
+    return true;
+
+  // Work around an issue with clang at the moment where forward
+  // declarations for objective C classes are emitted as:
+  //  DW_TAG_structure_type [2]
+  //  DW_AT_name( "ForwardObjcClass" )
+  //  DW_AT_byte_size( 0x00 )
+  //  DW_AT_decl_file( "..." )
+  //  DW_AT_decl_line( 1 )
+  //
+  // Note that there is no DW_AT_declaration and there are no children,
+  // and the byte size is zero.
+  return attrs.byte_size && *attrs.byte_size == 0 && attrs.name &&
+         !die.HasChildren() && cu_language == eLanguageTypeObjC;
+}
+
 TypeSP DWARFASTParserClang::ParseTypeFromClangModule(const SymbolContext &sc,
                                                      const DWARFDIE &die,
                                                      Log *log) {
@@ -249,11 +269,9 @@ static void ForcefullyCompleteType(CompilerType type) {
 /// This function serves a similar purpose as RequireCompleteType above, but it
 /// avoids completing the type if it is not immediately necessary. It only
 /// ensures we _can_ complete the type later.
-static void PrepareContextToReceiveMembers(TypeSystemClang &ast,
-                                           ClangASTImporter &ast_importer,
-                                           clang::DeclContext *decl_ctx,
-                                           DWARFDIE die,
-                                           const char *type_name_cstr) {
+void DWARFASTParserClang::PrepareContextToReceiveMembers(
+    clang::DeclContext *decl_ctx, const DWARFDIE &decl_ctx_die,
+    const DWARFDIE &die, const char *type_name_cstr) {
   auto *tag_decl_ctx = clang::dyn_cast<clang::TagDecl>(decl_ctx);
   if (!tag_decl_ctx)
     return; // Non-tag context are always ready.
@@ -268,7 +286,8 @@ static void PrepareContextToReceiveMembers(TypeSystemClang &ast,
   // gmodules case), we can complete the type by doing a full import.
 
   // If this type was not imported from an external AST, there's nothing to do.
-  CompilerType type = ast.GetTypeForDecl(tag_decl_ctx);
+  CompilerType type = m_ast.GetTypeForDecl(tag_decl_ctx);
+  ClangASTImporter &ast_importer = GetClangASTImporter();
   if (type && ast_importer.CanImport(type)) {
     auto qual_type = ClangUtil::GetQualType(type);
     if (ast_importer.RequireCompleteType(qual_type))
@@ -279,6 +298,13 @@ static void PrepareContextToReceiveMembers(TypeSystemClang &ast,
         type_name_cstr ? type_name_cstr : "", die.GetOffset());
   }
 
+  // By searching for the definition DIE of the decl_ctx type, we will either:
+  // 1. Find the definition DIE and start its definition with
+  // TypeSystemClang::StartTagDeclarationDefinition.
+  // 2. Fail to find it, in which case we need to forcefully complete it.
+  FindDefinitionTypeForDIE(decl_ctx_die);
+  if (tag_decl_ctx->isCompleteDefinition() || tag_decl_ctx->isBeingDefined())
+    return;
   // We don't have a type definition and/or the import failed. We must
   // forcefully complete the type to avoid crashes.
   ForcefullyCompleteType(type);
@@ -620,10 +646,11 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc,
   if (tag == DW_TAG_typedef) {
     // DeclContext will be populated when the clang type is materialized in
     // Type::ResolveCompilerType.
-    PrepareContextToReceiveMembers(
-        m_ast, GetClangASTImporter(),
-        GetClangDeclContextContainingDIE(die, nullptr), die,
-        attrs.name.GetCString());
+    DWARFDIE decl_ctx_die;
+    clang::DeclContext *decl_ctx =
+        GetClangDeclContextContainingDIE(die, &decl_ctx_die);
+    PrepareContextToReceiveMembers(decl_ctx, decl_ctx_die, die,
+                                   attrs.name.GetCString());
 
     if (attrs.type.IsValid()) {
       // Try to parse a typedef from the (DWARF embedded in the) Clang
@@ -1103,32 +1130,6 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
         // struct and see if this is actually a C++ method
         Type *class_type = dwarf->ResolveType(decl_ctx_die);
         if (class_type) {
-          if (class_type->GetID() != decl_ctx_die.GetID() ||
-              IsClangModuleFwdDecl(decl_ctx_die)) {
-
-            // We uniqued the parent class of this function to another
-            // class so we now need to associate all dies under
-            // "decl_ctx_die" to DIEs in the DIE for "class_type"...
-            DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID());
-
-            if (class_type_die) {
-              std::vector<DWARFDIE> failures;
-
-              CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die,
-                                         class_type, failures);
-
-              // FIXME do something with these failures that's
-              // smarter than just dropping them on the ground.
-              // Unfortunately classes don't like having stuff added
-              // to them after their definitions are complete...
-
-              Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()];
-              if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) {
-                return type_ptr->shared_from_this();
-              }
-            }
-          }
-
           if (attrs.specification.IsValid()) {
             // We have a specification which we are going to base our
             // function prototype off of, so we need this type to be
@@ -1263,6 +1264,39 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die,
               }
             }
           }
+          // By here, we should have already completed the c++ class_type
+          // because if either specification or abstract_origin is present, we
+          // call GetClangDeclContextForDIE to resolve the DW_TAG_subprogram
+          // referred to by this one until we reach a DW_TAG_subprogram without
+          // specification or abstract_origin (the else branch above). Then the
+          // above GetFullCompilerType() will complete the class_type if it's
+          // not completed yet. After that, we will have the mapping from DIEs
+          // in class_type_die to DeclContexts in m_die_to_decl_ctx.
+          if (class_type->GetID() != decl_ctx_die.GetID() ||
+              IsClangModuleFwdDecl(decl_ctx_die)) {
+
+            // We uniqued the parent class of this function to another
+            // class so we now need to associate all dies under
+            // "decl_ctx_die" to DIEs in the DIE for "class_type"...
+            DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID());
+
+            if (class_type_die) {
+              std::vector<DWARFDIE> failures;
+
+              CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die,
+                                         class_type, failures);
+
+              // FIXME do something with these failures that's
+              // smarter than just dropping them on the ground.
+              // Unfortunately classes don't like having stuff added
+              // to them after their definitions are complete...
+
+              Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()];
+              if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) {
+                return type_ptr->shared_from_this();
+              }
+            }
+          }
         }
       }
     }
@@ -1635,6 +1669,93 @@ DWARFASTParserClang::GetCPlusPlusQualifiedName(const DWARFDIE &die) {
   return qualified_name;
 }
 
+lldb_private::Type *
+DWARFASTParserClang::FindDefinitionTypeForDIE(const DWARFDIE &die) {
+  SymbolFileDWARF *dwarf = die.GetDWARF();
+  ParsedDWARFTypeAttributes attrs(die);
+  bool is_forward_declaration = IsForwardDeclaration(
+      die, attrs, SymbolFileDWARF::GetLanguage(*die.GetCU()));
+  if (!is_forward_declaration)
+    return dwarf->GetDIEToType()[die.GetDIE()];
+
+  const dw_tag_t tag = die.Tag();
+  TypeSP type_sp;
+  Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups);
+  if (log) {
+    dwarf->GetObjectFile()->GetModule()->LogMessage(
+        log,
+        "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a "
+        "forward declaration DIE, trying to find definition DIE",
+        static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag),
+        attrs.name.GetCString());
+  }
+  // We haven't parsed the definition DIE for this type yet, so search for it.
+  // After we find the definition DIE, the GetDeclarationDIEToDefinitionDIE()
+  // map will have the new mapping from this declaration DIE to definition DIE.
+  if (attrs.class_language == eLanguageTypeObjC ||
+      attrs.class_language == eLanguageTypeObjC_plus_plus) {
+    if (!attrs.is_complete_objc_class &&
+        die.Supports_DW_AT_APPLE_objc_complete_type()) {
+      // We have a valid eSymbolTypeObjCClass class symbol whose name
+      // matches the current objective C class that we are trying to find
+      // and this DIE isn't the complete definition (we checked
+      // is_complete_objc_class above and know it is false), so the real
+      // definition is in here somewhere
+      type_sp =
+          dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true);
+
+      if (!type_sp) {
+        SymbolFileDWARFDebugMap *debug_map_symfile =
+            dwarf->GetDebugMapSymfile();
+        if (debug_map_symfile) {
+          // We weren't able to find a full declaration in this DWARF,
+          // see if we have a declaration anywhere else...
+          type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE(
+              die, attrs.name, true);
+        }
+      }
+
+      if (type_sp && log) {
+        dwarf->GetObjectFile()->GetModule()->LogMessage(
+            log,
+            "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an "
+            "incomplete objc type, complete type is {5:x8}",
+            static_cast<void *>(this), die.GetOffset(),
+            DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(),
+            type_sp->GetID());
+      }
+    }
+  }
+
+  type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die);
+  if (!type_sp) {
+    SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile();
+    if (debug_map_symfile) {
+      // We weren't able to find a full declaration in this DWARF, see
+      // if we have a declaration anywhere else...
+      type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext(die);
+    }
+    if (type_sp && log) {
+      dwarf->GetObjectFile()->GetModule()->LogMessage(
+          log,
+          "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a "
+          "forward declaration, complete type is {4:x8}",
+          static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag),
+          attrs.name.GetCString(), type_sp->GetID());
+    }
+  }
+
+  if (!type_sp && log) {
+    dwarf->GetObjectFile()->GetModule()->LogMessage(
+        log,
+        "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a "
+        "forward declaration, unable to find definition DIE for it",
+        static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag),
+        attrs.name.GetCString());
+  }
+  return type_sp.get();
+}
+
 TypeSP
 DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
                                            const DWARFDIE &die,
@@ -1646,14 +1767,10 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
   LanguageType cu_language = SymbolFileDWARF::GetLanguage(*die.GetCU());
   Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups);
 
-  // UniqueDWARFASTType is large, so don't create a local variables on the
-  // stack, put it on the heap. This function is often called recursively and
-  // clang isn't good at sharing the stack space for variables in different
-  // blocks.
-  auto unique_ast_entry_up = std::make_unique<UniqueDWARFASTType>();
-
   ConstString unique_typename(attrs.name);
   Declaration unique_decl(attrs.decl);
+  uint64_t byte_size = attrs.byte_size.value_or(0);
+  attrs.is_forward_declaration = IsForwardDeclaration(die, attrs, cu_language);
 
   if (attrs.name) {
     if (Language::LanguageIsCPlusPlus(cu_language)) {
@@ -1666,14 +1783,42 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
       unique_decl.Clear();
     }
 
-    if (dwarf->GetUniqueDWARFASTTypeMap().Find(
-            unique_typename, die, unique_decl, attrs.byte_size.value_or(-1),
-            *unique_ast_entry_up)) {
-      type_sp = unique_ast_entry_up->m_type_sp;
+    if (UniqueDWARFASTType *unique_ast_entry_type =
+            dwarf->GetUniqueDWARFASTTypeMap().Find(
+                unique_typename, die, unique_decl, byte_size,
+                attrs.is_forward_declaration)) {
+      type_sp = unique_ast_entry_type->m_type_sp;
       if (type_sp) {
         dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get();
         LinkDeclContextToDIE(
-            GetCachedClangDeclContextForDIE(unique_ast_entry_up->m_die), die);
+            GetCachedClangDeclContextForDIE(unique_ast_entry_type->m_die), die);
+        if (!attrs.is_forward_declaration) {
+          // If the DIE being parsed in this function is a definition and the
+          // entry in the map is a declaration, then we need to update the entry
+          // to point to the definition DIE.
+          if (unique_ast_entry_type->m_is_forward_declaration) {
+            unique_ast_entry_type->m_die = die;
+            unique_ast_entry_type->m_byte_size = byte_size;
+            unique_ast_entry_type->m_declaration = unique_decl;
+            unique_ast_entry_type->m_is_forward_declaration = false;
+            // Need to update Type ID to refer to the definition DIE, because
+            // it's used in ParseSubroutine to determine if we need to copy cxx
+            // method types from a declaration DIE to this definition DIE.
+            type_sp->SetID(die.GetID());
+            clang_type = type_sp->GetForwardCompilerType();
+            if (attrs.class_language != eLanguageTypeObjC &&
+                attrs.class_language != eLanguageTypeObjC_plus_plus)
+              TypeSystemClang::StartTagDeclarationDefinition(clang_type);
+
+            CompilerType compiler_type_no_qualifiers =
+                ClangUtil::RemoveFastQualifiers(clang_type);
+            auto result = dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace(
+                compiler_type_no_qualifiers.GetOpaqueQualType(),
+                *die.GetDIERef());
+            if (!result.second)
+              result.first->second = *die.GetDIERef();
+          }
+        }
         return type_sp;
       }
     }
@@ -1695,125 +1840,21 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
     default_accessibility = eAccessPrivate;
   }
 
-  if (attrs.byte_size && *attrs.byte_size == 0 && attrs.name &&
-      !die.HasChildren() && cu_language == eLanguageTypeObjC) {
-    // Work around an issue with clang at the moment where forward
-    // declarations for objective C classes are emitted as:
-    //  DW_TAG_structure_type [2]
-    //  DW_AT_name( "ForwardObjcClass" )
-    //  DW_AT_byte_size( 0x00 )
-    //  DW_AT_decl_file( "..." )
-    //  DW_AT_decl_line( 1 )
-    //
-    // Note that there is no DW_AT_declaration and there are no children,
-    // and the byte size is zero.
-    attrs.is_forward_declaration = true;
-  }
-
-  if (attrs.class_language == eLanguageTypeObjC ||
-      attrs.class_language == eLanguageTypeObjC_plus_plus) {
-    if (!attrs.is_complete_objc_class &&
-        die.Supports_DW_AT_APPLE_objc_complete_type()) {
-      // We have a valid eSymbolTypeObjCClass class symbol whose name
-      // matches the current objective C class that we are trying to find
-      // and this DIE isn't the complete definition (we checked
-      // is_complete_objc_class above and know it is false), so the real
-      // definition is in here somewhere
-      type_sp =
-          dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true);
-
-      if (!type_sp) {
-        SymbolFileDWARFDebugMap *debug_map_symfile =
-            dwarf->GetDebugMapSymfile();
-        if (debug_map_symfile) {
-          // We weren't able to find a full declaration in this DWARF,
-          // see if we have a declaration anywhere else...
-          type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE(
-              die, attrs.name, true);
-        }
-      }
-
-      if (type_sp) {
-        if (log) {
-          dwarf->GetObjectFile()->GetModule()->LogMessage(
-              log,
-              "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an "
-              "incomplete objc type, complete type is {5:x8}",
-              static_cast<void *>(this), die.GetOffset(),
-              DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(),
-              type_sp->GetID());
-        }
-
-        // We found a real definition for this type elsewhere so lets use
-        // it and cache the fact that we found a complete type for this
-        // die
-        dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get();
-        return type_sp;
-      }
-    }
-  }
-
   if (attrs.is_forward_declaration) {
-    // We have a forward declaration to a type and we need to try and
-    // find a full declaration. We look in the current type index just in
-    // case we have a forward declaration followed by an actual
-    // declarations in the DWARF. If this fails, we need to look
-    // elsewhere...
-    if (log) {
-      dwarf->GetObjectFile()->GetModule()->LogMessage(
-          log,
-          "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is a "
-          "forward declaration, trying to find complete type",
-          static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag),
-          tag, attrs.name.GetCString());
-    }
-
     // See if the type comes from a Clang module and if so, track down
     // that type.
     type_sp = ParseTypeFromClangModule(sc, die, log);
     if (type_sp)
       return type_sp;
-
-    // type_sp = FindDefinitionTypeForDIE (dwarf_cu, die,
-    // type_name_const_str);
-    type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die);
-
-    if (!type_sp) {
-      SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile();
-      if (debug_map_symfile) {
-        // We weren't able to find a full declaration in this DWARF, see
-        // if we have a declaration anywhere else...
-        type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext(die);
-      }
-    }
-
-    if (type_sp) {
-      if (log) {
-        dwarf->GetObjectFile()->GetModule()->LogMessage(
-            log,
-            "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is a "
-            "forward declaration, complete type is {5:x8}",
-            static_cast<void *>(this), die.GetOffset(),
-            DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(),
-            type_sp->GetID());
-      }
-
-      // We found a real definition for this type elsewhere so lets use
-      // it and cache the fact that we found a complete type for this die
-      dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get();
-      clang::DeclContext *defn_decl_ctx =
-          GetCachedClangDeclContextForDIE(dwarf->GetDIE(type_sp->GetID()));
-      if (defn_decl_ctx)
-        LinkDeclContextToDIE(defn_decl_ctx, die);
-      return type_sp;
-    }
   }
+
   assert(tag_decl_kind != -1);
   UNUSED_IF_ASSERT_DISABLED(tag_decl_kind);
-  bool clang_type_was_created = false;
-  clang::DeclContext *decl_ctx = GetClangDeclContextContainingDIE(die, nullptr);
+  DWARFDIE decl_ctx_die;
+  clang::DeclContext *decl_ctx =
+      GetClangDeclContextContainingDIE(die, &decl_ctx_die);
 
-  PrepareContextToReceiveMembers(m_ast, GetClangASTImporter(), decl_ctx, die,
+  PrepareContextToReceiveMembers(decl_ctx, decl_ctx_die, die,
                                  attrs.name.GetCString());
 
   if (attrs.accessibility == eAccessNone && decl_ctx) {
@@ -1852,20 +1893,17 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
             tag_decl_kind, template_param_infos);
     clang_type =
         m_ast.CreateClassTemplateSpecializationType(class_specialization_decl);
-    clang_type_was_created = true;
 
     m_ast.SetMetadata(class_template_decl, metadata);
     m_ast.SetMetadata(class_specialization_decl, metadata);
   }
 
-  if (!clang_type_was_created) {
-    clang_type_was_created = true;
+  if (!clang_type) {
     clang_type = m_ast.CreateRecordType(
         decl_ctx, GetOwningClangModule(die), attrs.accessibility,
         attrs.name.GetCString(), tag_decl_kind, attrs.class_language, &metadata,
         attrs.exports_symbols);
   }
-
   // Store a forward declaration to this class type in case any
   // parameters in any class methods need it for the clang types for
   // function prototypes.
@@ -1876,13 +1914,19 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
       Type::ResolveState::Forward,
       TypePayloadClang(OptionalClangModuleID(), attrs.is_complete_objc_class));
 
+  // UniqueDWARFASTType is large, so don't create local variables on the
+  // stack, put it on the heap. This function is often called recursively and
+  // clang isn't good at sharing the stack space for variables in different
+  // blocks.
+  auto unique_ast_entry_up = std::make_unique<UniqueDWARFASTType>();
   // Add our type to the unique type map so we don't end up creating many
   // copies of the same type over and over in the ASTContext for our
   // module
   unique_ast_entry_up->m_type_sp = type_sp;
   unique_ast_entry_up->m_die = die;
   unique_ast_entry_up->m_declaration = unique_decl;
-  unique_ast_entry_up->m_byte_size = attrs.byte_size.value_or(0);
+  unique_ast_entry_up->m_byte_size = byte_size;
+  unique_ast_entry_up->m_is_forward_declaration = attrs.is_forward_declaration;
   dwarf->GetUniqueDWARFASTTypeMap().Insert(unique_typename,
                                            *unique_ast_entry_up);
 
@@ -1923,7 +1967,7 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
           GetClangASTImporter().SetRecordLayout(record_decl, layout);
         }
       }
-    } else if (clang_type_was_created) {
+    } else {
       // Start the definition if the class is not objective C since the
       // underlying decls respond to isCompleteDefinition(). Objective
       // C decls don't respond to isCompleteDefinition() so we can't
@@ -1935,26 +1979,21 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc,
       if (attrs.class_language != eLanguageTypeObjC &&
           attrs.class_language != eLanguageTypeObjC_plus_plus)
         TypeSystemClang::StartTagDeclarationDefinition(clang_type);
-
-      // Leave this as a forward declaration until we need to know the
-      // details of the type. lldb_private::Type will automatically call
-      // the SymbolFile virtual function
-      // "SymbolFileDWARF::CompleteType(Type *)" When the definition
-      // needs to be defined.
-      assert(!dwarf->GetForwardDeclCompilerTypeToDIE().count(
-                 ClangUtil::RemoveFastQualifiers(clang_type)
-                     .GetOpaqueQualType()) &&
-             "Type already in the forward declaration map!");
-      // Can't assume m_ast.GetSymbolFile() is actually a
-      // SymbolFileDWARF, it can be a SymbolFileDWARFDebugMap for Apple
-      // binaries.
-      dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace(
-          ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(),
-          *die.GetDIERef());
-      m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true);
     }
   }
 
+  // If this is a declaration DIE, leave this as a forward declaration until we
+  // need to know the details of the type. lldb_private::Type will automatically
+  // call the SymbolFile virtual function "SymbolFileDWARF::CompleteType(Type
+  // *)" when the definition needs to be defined.
+  assert(!dwarf->GetForwardDeclCompilerTypeToDIE().count(
+             ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType()) &&
+         "Type already in the forward declaration map!");
+  dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace(
+      ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(),
+      *die.GetDIERef());
+  m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true);
+
   // If we made a clang type, set the trivial abi if applicable: We only
   // do this for pass by value - which implies the Trivial ABI. There
   // isn't a way to assert that something that would normally be pass by
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
index 8d4af203bb287..853b8ccc30369 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -42,40 +42,40 @@ struct ParsedDWARFTypeAttributes;
 
 class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
 public:
+  typedef lldb_private::plugin::dwarf::DWARFDIE DWARFDIE;
+
   DWARFASTParserClang(lldb_private::TypeSystemClang &ast);
 
   ~DWARFASTParserClang() override;
 
   // DWARFASTParser interface.
-  lldb::TypeSP
-  ParseTypeFromDWARF(const lldb_private::SymbolContext &sc,
-                     const lldb_private::plugin::dwarf::DWARFDIE &die,
-                     bool *type_is_new_ptr) override;
+  lldb::TypeSP ParseTypeFromDWARF(const lldb_private::SymbolContext &sc,
+                                  const DWARFDIE &die,
+                                  bool *type_is_new_ptr) override;
 
-  lldb_private::ConstString ConstructDemangledNameFromDWARF(
-      const lldb_private::plugin::dwarf::DWARFDIE &die) override;
+  lldb_private::ConstString
+  ConstructDemangledNameFromDWARF(const DWARFDIE &die) override;
 
   lldb_private::Function *
   ParseFunctionFromDWARF(lldb_private::CompileUnit &comp_unit,
-                         const lldb_private::plugin::dwarf::DWARFDIE &die,
+                         const DWARFDIE &die,
                          const lldb_private::AddressRange &func_range) override;
 
   bool
-  CompleteTypeFromDWARF(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                        lldb_private::Type *type,
+  CompleteTypeFromDWARF(const DWARFDIE &die, lldb_private::Type *type,
                         lldb_private::CompilerType &compiler_type) override;
 
-  lldb_private::CompilerDecl GetDeclForUIDFromDWARF(
-      const lldb_private::plugin::dwarf::DWARFDIE &die) override;
+  lldb_private::CompilerDecl
+  GetDeclForUIDFromDWARF(const DWARFDIE &die) override;
 
   void EnsureAllDIEsInDeclContextHaveBeenParsed(
       lldb_private::CompilerDeclContext decl_context) override;
 
-  lldb_private::CompilerDeclContext GetDeclContextForUIDFromDWARF(
-      const lldb_private::plugin::dwarf::DWARFDIE &die) override;
+  lldb_private::CompilerDeclContext
+  GetDeclContextForUIDFromDWARF(const DWARFDIE &die) override;
 
-  lldb_private::CompilerDeclContext GetDeclContextContainingUIDFromDWARF(
-      const lldb_private::plugin::dwarf::DWARFDIE &die) override;
+  lldb_private::CompilerDeclContext
+  GetDeclContextContainingUIDFromDWARF(const DWARFDIE &die) override;
 
   lldb_private::ClangASTImporter &GetClangASTImporter();
 
@@ -105,8 +105,13 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
   /// \return A string, including surrounding '<>', of the template parameters.
   /// If the DIE's name already has '<>', returns an empty ConstString because
   /// it's assumed that the caller is using the DIE name anyway.
-  lldb_private::ConstString GetDIEClassTemplateParams(
-      const lldb_private::plugin::dwarf::DWARFDIE &die) override;
+  lldb_private::ConstString
+  GetDIEClassTemplateParams(const DWARFDIE &die) override;
+
+  // Search for the definition DIE of the given DIE and return the type
+  // associated with that definition DIE, or nullptr if no definition DIE is
+  // found.
+  lldb_private::Type *FindDefinitionTypeForDIE(const DWARFDIE &die) override;
 
 protected:
   /// Protected typedefs and members.
@@ -118,8 +123,7 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
       const lldb_private::plugin::dwarf::DWARFDebugInfoEntry *,
       clang::DeclContext *>
       DIEToDeclContextMap;
-  typedef std::multimap<const clang::DeclContext *,
-                        const lldb_private::plugin::dwarf::DWARFDIE>
+  typedef std::multimap<const clang::DeclContext *, const DWARFDIE>
       DeclContextToDIEMap;
   typedef llvm::DenseMap<
       const lldb_private::plugin::dwarf::DWARFDebugInfoEntry *,
@@ -137,14 +141,11 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
   std::unique_ptr<lldb_private::ClangASTImporter> m_clang_ast_importer_up;
   /// @}
 
-  clang::DeclContext *
-  GetDeclContextForBlock(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::DeclContext *GetDeclContextForBlock(const DWARFDIE &die);
 
-  clang::BlockDecl *
-  ResolveBlockDIE(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::BlockDecl *ResolveBlockDIE(const DWARFDIE &die);
 
-  clang::NamespaceDecl *
-  ResolveNamespaceDIE(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::NamespaceDecl *ResolveNamespaceDIE(const DWARFDIE &die);
 
   /// Returns the namespace decl that a DW_TAG_imported_declaration imports.
   ///
@@ -155,96 +156,86 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
   ///          'die' imports. If the imported entity is not a namespace
   ///          or another import declaration, returns nullptr. If an error
   ///          occurs, returns nullptr.
-  clang::NamespaceDecl *ResolveImportedDeclarationDIE(
-      const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::NamespaceDecl *ResolveImportedDeclarationDIE(const DWARFDIE &die);
 
-  bool ParseTemplateDIE(const lldb_private::plugin::dwarf::DWARFDIE &die,
+  bool ParseTemplateDIE(const DWARFDIE &die,
                         lldb_private::TypeSystemClang::TemplateParameterInfos
                             &template_param_infos);
 
   bool ParseTemplateParameterInfos(
-      const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
+      const DWARFDIE &parent_die,
       lldb_private::TypeSystemClang::TemplateParameterInfos
           &template_param_infos);
 
-  std::string
-  GetCPlusPlusQualifiedName(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  std::string GetCPlusPlusQualifiedName(const DWARFDIE &die);
 
   bool ParseChildMembers(
-      const lldb_private::plugin::dwarf::DWARFDIE &die,
-      lldb_private::CompilerType &class_compiler_type,
+      const DWARFDIE &die, lldb_private::CompilerType &class_compiler_type,
       std::vector<std::unique_ptr<clang::CXXBaseSpecifier>> &base_classes,
-      std::vector<lldb_private::plugin::dwarf::DWARFDIE> &member_function_dies,
-      std::vector<lldb_private::plugin::dwarf::DWARFDIE> &contained_type_dies,
+      std::vector<DWARFDIE> &member_function_dies,
+      std::vector<DWARFDIE> &contained_type_dies,
       DelayedPropertyList &delayed_properties,
       const lldb::AccessType default_accessibility,
       lldb_private::ClangASTImporter::LayoutInfo &layout_info);
 
   size_t
   ParseChildParameters(clang::DeclContext *containing_decl_ctx,
-                       const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
-                       bool skip_artificial, bool &is_static, bool &is_variadic,
+                       const DWARFDIE &parent_die, bool skip_artificial,
+                       bool &is_static, bool &is_variadic,
                        bool &has_template_params,
                        std::vector<lldb_private::CompilerType> &function_args,
                        std::vector<clang::ParmVarDecl *> &function_param_decls,
                        unsigned &type_quals);
 
-  size_t ParseChildEnumerators(
-      lldb_private::CompilerType &compiler_type, bool is_signed,
-      uint32_t enumerator_byte_size,
-      const lldb_private::plugin::dwarf::DWARFDIE &parent_die);
+  size_t ParseChildEnumerators(lldb_private::CompilerType &compiler_type,
+                               bool is_signed, uint32_t enumerator_byte_size,
+                               const DWARFDIE &parent_die);
 
   /// Parse a structure, class, or union type DIE.
-  lldb::TypeSP
-  ParseStructureLikeDIE(const lldb_private::SymbolContext &sc,
-                        const lldb_private::plugin::dwarf::DWARFDIE &die,
-                        ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParseStructureLikeDIE(const lldb_private::SymbolContext &sc,
+                                     const DWARFDIE &die,
+                                     ParsedDWARFTypeAttributes &attrs);
 
-  clang::Decl *
-  GetClangDeclForDIE(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::Decl *GetClangDeclForDIE(const DWARFDIE &die);
 
-  clang::DeclContext *
-  GetClangDeclContextForDIE(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::DeclContext *GetClangDeclContextForDIE(const DWARFDIE &die);
 
-  clang::DeclContext *GetClangDeclContextContainingDIE(
-      const lldb_private::plugin::dwarf::DWARFDIE &die,
-      lldb_private::plugin::dwarf::DWARFDIE *decl_ctx_die);
-  lldb_private::OptionalClangModuleID
-  GetOwningClangModule(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::DeclContext *GetClangDeclContextContainingDIE(const DWARFDIE &die,
+                                                       DWARFDIE *decl_ctx_die);
+  lldb_private::OptionalClangModuleID GetOwningClangModule(const DWARFDIE &die);
 
-  bool CopyUniqueClassMethodTypes(
-      const lldb_private::plugin::dwarf::DWARFDIE &src_class_die,
-      const lldb_private::plugin::dwarf::DWARFDIE &dst_class_die,
-      lldb_private::Type *class_type,
-      std::vector<lldb_private::plugin::dwarf::DWARFDIE> &failures);
+  bool CopyUniqueClassMethodTypes(const DWARFDIE &src_class_die,
+                                  const DWARFDIE &dst_class_die,
+                                  lldb_private::Type *class_type,
+                                  std::vector<DWARFDIE> &failures);
 
-  clang::DeclContext *GetCachedClangDeclContextForDIE(
-      const lldb_private::plugin::dwarf::DWARFDIE &die);
+  clang::DeclContext *GetCachedClangDeclContextForDIE(const DWARFDIE &die);
 
-  void LinkDeclContextToDIE(clang::DeclContext *decl_ctx,
-                            const lldb_private::plugin::dwarf::DWARFDIE &die);
+  void LinkDeclContextToDIE(clang::DeclContext *decl_ctx, const DWARFDIE &die);
 
-  void LinkDeclToDIE(clang::Decl *decl,
-                     const lldb_private::plugin::dwarf::DWARFDIE &die);
+  void LinkDeclToDIE(clang::Decl *decl, const DWARFDIE &die);
 
   /// If \p type_sp is valid, calculate and set its symbol context scope, and
   /// update the type list for its backing symbol file.
   ///
   /// Returns \p type_sp.
-  lldb::TypeSP UpdateSymbolContextScopeForType(
-      const lldb_private::SymbolContext &sc,
-      const lldb_private::plugin::dwarf::DWARFDIE &die, lldb::TypeSP type_sp);
+  lldb::TypeSP
+  UpdateSymbolContextScopeForType(const lldb_private::SymbolContext &sc,
+                                  const DWARFDIE &die, lldb::TypeSP type_sp);
 
   /// Follow Clang Module Skeleton CU references to find a type definition.
-  lldb::TypeSP
-  ParseTypeFromClangModule(const lldb_private::SymbolContext &sc,
-                           const lldb_private::plugin::dwarf::DWARFDIE &die,
-                           lldb_private::Log *log);
+  lldb::TypeSP ParseTypeFromClangModule(const lldb_private::SymbolContext &sc,
+                                        const DWARFDIE &die,
+                                        lldb_private::Log *log);
 
   // Return true if this type is a declaration to a type in an external
   // module.
-  lldb::ModuleSP
-  GetModuleForType(const lldb_private::plugin::dwarf::DWARFDIE &die);
+  lldb::ModuleSP GetModuleForType(const DWARFDIE &die);
+
+  void PrepareContextToReceiveMembers(clang::DeclContext *decl_ctx,
+                                      const DWARFDIE &decl_ctx_die,
+                                      const DWARFDIE &die,
+                                      const char *type_name_cstr);
 
   static bool classof(const DWARFASTParser *Parser) {
     return Parser->GetKind() == Kind::DWARFASTParserClang;
@@ -274,10 +265,8 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
 
   /// Parsed form of all attributes that are relevant for parsing type members.
   struct MemberAttributes {
-    explicit MemberAttributes(
-        const lldb_private::plugin::dwarf::DWARFDIE &die,
-        const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
-        lldb::ModuleSP module_sp);
+    explicit MemberAttributes(const DWARFDIE &die, const DWARFDIE &parent_die,
+                              lldb::ModuleSP module_sp);
     const char *name = nullptr;
     /// Indicates how many bits into the word (according to the host endianness)
     /// the low-order bit of the field starts. Can be negative.
@@ -324,15 +313,12 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
   /// created property.
   /// \param delayed_properties The list of delayed properties that the result
   /// will be appended to.
-  void
-  ParseObjCProperty(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                    const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
-                    const lldb_private::CompilerType &class_clang_type,
-                    DelayedPropertyList &delayed_properties);
+  void ParseObjCProperty(const DWARFDIE &die, const DWARFDIE &parent_die,
+                         const lldb_private::CompilerType &class_clang_type,
+                         DelayedPropertyList &delayed_properties);
 
   void
-  ParseSingleMember(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                    const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
+  ParseSingleMember(const DWARFDIE &die, const DWARFDIE &parent_die,
                     const lldb_private::CompilerType &class_clang_type,
                     lldb::AccessType default_accessibility,
                     lldb_private::ClangASTImporter::LayoutInfo &layout_info,
@@ -350,31 +336,25 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
   /// \param[in] class_clang_type The parent RecordType of the static
   ///                             member this function will create.
   void CreateStaticMemberVariable(
-      const lldb_private::plugin::dwarf::DWARFDIE &die,
-      const MemberAttributes &attrs,
+      const DWARFDIE &die, const MemberAttributes &attrs,
       const lldb_private::CompilerType &class_clang_type);
 
-  bool CompleteRecordType(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                          lldb_private::Type *type,
+  bool CompleteRecordType(const DWARFDIE &die, lldb_private::Type *type,
                           lldb_private::CompilerType &clang_type);
-  bool CompleteEnumType(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                        lldb_private::Type *type,
+  bool CompleteEnumType(const DWARFDIE &die, lldb_private::Type *type,
                         lldb_private::CompilerType &clang_type);
 
-  lldb::TypeSP
-  ParseTypeModifier(const lldb_private::SymbolContext &sc,
-                    const lldb_private::plugin::dwarf::DWARFDIE &die,
-                    ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParseTypeModifier(const lldb_private::SymbolContext &sc,
+                                 const DWARFDIE &die,
+                                 ParsedDWARFTypeAttributes &attrs);
   lldb::TypeSP ParseEnum(const lldb_private::SymbolContext &sc,
-                         const lldb_private::plugin::dwarf::DWARFDIE &die,
-                         ParsedDWARFTypeAttributes &attrs);
-  lldb::TypeSP ParseSubroutine(const lldb_private::plugin::dwarf::DWARFDIE &die,
+                         const DWARFDIE &die, ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParseSubroutine(const DWARFDIE &die,
                                const ParsedDWARFTypeAttributes &attrs);
-  lldb::TypeSP ParseArrayType(const lldb_private::plugin::dwarf::DWARFDIE &die,
+  lldb::TypeSP ParseArrayType(const DWARFDIE &die,
                               const ParsedDWARFTypeAttributes &attrs);
-  lldb::TypeSP
-  ParsePointerToMemberType(const lldb_private::plugin::dwarf::DWARFDIE &die,
-                           const ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParsePointerToMemberType(const DWARFDIE &die,
+                                        const ParsedDWARFTypeAttributes &attrs);
 
   /// Parses a DW_TAG_inheritance DIE into a base/super class.
   ///
@@ -391,8 +371,7 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
   /// \param layout_info The layout information that will be updated for C++
   /// base classes with the base offset.
   void ParseInheritance(
-      const lldb_private::plugin::dwarf::DWARFDIE &die,
-      const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
+      const DWARFDIE &die, const DWARFDIE &parent_die,
       const lldb_private::CompilerType class_clang_type,
       const lldb::AccessType default_accessibility,
       const lldb::ModuleSP &module_sp,
@@ -409,8 +388,7 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
   /// \param layout_info The layout information that will be updated for
   //   base classes with the base offset
   void
-  ParseRustVariantPart(lldb_private::plugin::dwarf::DWARFDIE &die,
-                       const lldb_private::plugin::dwarf::DWARFDIE &parent_die,
+  ParseRustVariantPart(DWARFDIE &die, const DWARFDIE &parent_die,
                        lldb_private::CompilerType &class_clang_type,
                        const lldb::AccessType default_accesibility,
                        lldb_private::ClangASTImporter::LayoutInfo &layout_info);
@@ -420,8 +398,9 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser {
 /// Some attributes are relevant for all kinds of types (declaration), while
 /// others are only meaningful to a specific type (is_virtual)
 struct ParsedDWARFTypeAttributes {
-  explicit ParsedDWARFTypeAttributes(
-      const lldb_private::plugin::dwarf::DWARFDIE &die);
+  typedef lldb_private::plugin::dwarf::DWARFDIE DWARFDIE;
+
+  explicit ParsedDWARFTypeAttributes(const DWARFDIE &die);
 
   lldb::AccessType accessibility = lldb::eAccessNone;
   bool is_artificial = false;
@@ -438,7 +417,7 @@ struct ParsedDWARFTypeAttributes {
   const char *mangled_name = nullptr;
   lldb_private::ConstString name;
   lldb_private::Declaration decl;
-  lldb_private::plugin::dwarf::DWARFDIE object_pointer;
+  DWARFDIE object_pointer;
   lldb_private::plugin::dwarf::DWARFFormValue abstract_origin;
   lldb_private::plugin::dwarf::DWARFFormValue containing_type;
   lldb_private::plugin::dwarf::DWARFFormValue signature;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp
index 4884374ef9472..03e289bbf3300 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDIE.cpp
@@ -13,6 +13,7 @@
 #include "DWARFDebugInfoEntry.h"
 #include "DWARFDeclContext.h"
 #include "DWARFUnit.h"
+#include "lldb/Symbol/Type.h"
 
 #include "llvm/ADT/iterator.h"
 
@@ -379,108 +380,118 @@ std::vector<DWARFDIE> DWARFDIE::GetDeclContextDIEs() const {
   return result;
 }
 
-static std::vector<lldb_private::CompilerContext>
-GetDeclContextImpl(llvm::SmallSet<lldb::user_id_t, 4> &seen, DWARFDIE die) {
-  std::vector<lldb_private::CompilerContext> context;
+static void GetDeclContextImpl(DWARFDIE die,
+                               llvm::SmallSet<lldb::user_id_t, 4> &seen,
+                               std::vector<CompilerContext> &context) {
   // Stop if we hit a cycle.
-  if (!die || !seen.insert(die.GetID()).second)
-    return context;
-
-  // Handle outline member function DIEs by following the specification.
-  if (DWARFDIE spec = die.GetReferencedDIE(DW_AT_specification))
-    return GetDeclContextImpl(seen, spec);
-
-  // Get the parent context chain.
-  context = GetDeclContextImpl(seen, die.GetParent());
+  while (die && seen.insert(die.GetID()).second) {
+    // Handle outline member function DIEs by following the specification.
+    if (DWARFDIE spec = die.GetReferencedDIE(DW_AT_specification)) {
+      die = spec;
+      continue;
+    }
 
-  // Add this DIE's contribution at the end of the chain.
-  auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) {
-    context.push_back({kind, ConstString(name)});
-  };
-  switch (die.Tag()) {
-  case DW_TAG_module:
-    push_ctx(CompilerContextKind::Module, die.GetName());
-    break;
-  case DW_TAG_namespace:
-    push_ctx(CompilerContextKind::Namespace, die.GetName());
-    break;
-  case DW_TAG_structure_type:
-    push_ctx(CompilerContextKind::Struct, die.GetName());
-    break;
-  case DW_TAG_union_type:
-    push_ctx(CompilerContextKind::Union, die.GetName());
-    break;
-  case DW_TAG_class_type:
-    push_ctx(CompilerContextKind::Class, die.GetName());
-    break;
-  case DW_TAG_enumeration_type:
-    push_ctx(CompilerContextKind::Enum, die.GetName());
-    break;
-  case DW_TAG_subprogram:
-    push_ctx(CompilerContextKind::Function, die.GetName());
-    break;
-  case DW_TAG_variable:
-    push_ctx(CompilerContextKind::Variable, die.GetPubname());
-    break;
-  case DW_TAG_typedef:
-    push_ctx(CompilerContextKind::Typedef, die.GetName());
-    break;
-  default:
-    break;
+    // Add this DIE's contribution at the end of the chain.
+    auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) {
+      context.push_back({kind, ConstString(name)});
+    };
+    switch (die.Tag()) {
+    case DW_TAG_module:
+      push_ctx(CompilerContextKind::Module, die.GetName());
+      break;
+    case DW_TAG_namespace:
+      push_ctx(CompilerContextKind::Namespace, die.GetName());
+      break;
+    case DW_TAG_structure_type:
+      push_ctx(CompilerContextKind::Struct, die.GetName());
+      break;
+    case DW_TAG_union_type:
+      push_ctx(CompilerContextKind::Union, die.GetName());
+      break;
+    case DW_TAG_class_type:
+      push_ctx(CompilerContextKind::Class, die.GetName());
+      break;
+    case DW_TAG_enumeration_type:
+      push_ctx(CompilerContextKind::Enum, die.GetName());
+      break;
+    case DW_TAG_subprogram:
+      push_ctx(CompilerContextKind::Function, die.GetName());
+      break;
+    case DW_TAG_variable:
+      push_ctx(CompilerContextKind::Variable, die.GetPubname());
+      break;
+    case DW_TAG_typedef:
+      push_ctx(CompilerContextKind::Typedef, die.GetName());
+      break;
+    default:
+      break;
+    }
+    // Now process the parent.
+    die = die.GetParent();
   }
-  return context;
 }
 
-std::vector<lldb_private::CompilerContext> DWARFDIE::GetDeclContext() const {
+std::vector<CompilerContext> DWARFDIE::GetDeclContext() const {
   llvm::SmallSet<lldb::user_id_t, 4> seen;
-  return GetDeclContextImpl(seen, *this);
+  std::vector<CompilerContext> context;
+  GetDeclContextImpl(*this, seen, context);
+  std::reverse(context.begin(), context.end());
+  return context;
 }
 
-std::vector<lldb_private::CompilerContext>
-DWARFDIE::GetTypeLookupContext() const {
-  std::vector<lldb_private::CompilerContext> context;
-  // If there is no name, then there is no need to look anything up for this
-  // DIE.
-  const char *name = GetName();
-  if (!name || !name[0])
-    return context;
-  const dw_tag_t tag = Tag();
-  if (tag == DW_TAG_compile_unit || tag == DW_TAG_partial_unit)
-    return context;
-  DWARFDIE parent = GetParent();
-  if (parent)
-    context = parent.GetTypeLookupContext();
-  auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) {
-    context.push_back({kind, ConstString(name)});
-  };
-  switch (tag) {
-  case DW_TAG_namespace:
-    push_ctx(CompilerContextKind::Namespace, name);
-    break;
-  case DW_TAG_structure_type:
-    push_ctx(CompilerContextKind::Struct, name);
-    break;
-  case DW_TAG_union_type:
-    push_ctx(CompilerContextKind::Union, name);
-    break;
-  case DW_TAG_class_type:
-    push_ctx(CompilerContextKind::Class, name);
-    break;
-  case DW_TAG_enumeration_type:
-    push_ctx(CompilerContextKind::Enum, name);
-    break;
-  case DW_TAG_variable:
-    push_ctx(CompilerContextKind::Variable, GetPubname());
-    break;
-  case DW_TAG_typedef:
-    push_ctx(CompilerContextKind::Typedef, name);
-    break;
-  case DW_TAG_base_type:
-    push_ctx(CompilerContextKind::Builtin, name);
-    break;
-  default:
-    break;
+static void GetTypeLookupContextImpl(DWARFDIE die,
+                                     llvm::SmallSet<lldb::user_id_t, 4> &seen,
+                                     std::vector<CompilerContext> &context) {
+  // Stop if we hit a cycle.
+  while (die && seen.insert(die.GetID()).second) {
+    // If there is no name, then there is no need to look anything up for this
+    // DIE.
+    const char *name = die.GetName();
+    if (!name || !name[0])
+      return;
+
+    // Add this DIE's contribution at the end of the chain.
+    auto push_ctx = [&](CompilerContextKind kind, llvm::StringRef name) {
+      context.push_back({kind, ConstString(name)});
+    };
+    switch (die.Tag()) {
+    case DW_TAG_namespace:
+      push_ctx(CompilerContextKind::Namespace, die.GetName());
+      break;
+    case DW_TAG_structure_type:
+      push_ctx(CompilerContextKind::Struct, die.GetName());
+      break;
+    case DW_TAG_union_type:
+      push_ctx(CompilerContextKind::Union, die.GetName());
+      break;
+    case DW_TAG_class_type:
+      push_ctx(CompilerContextKind::Class, die.GetName());
+      break;
+    case DW_TAG_enumeration_type:
+      push_ctx(CompilerContextKind::Enum, die.GetName());
+      break;
+    case DW_TAG_variable:
+      push_ctx(CompilerContextKind::Variable, die.GetPubname());
+      break;
+    case DW_TAG_typedef:
+      push_ctx(CompilerContextKind::Typedef, die.GetName());
+      break;
+    case DW_TAG_base_type:
+      push_ctx(CompilerContextKind::Builtin, name);
+      break;
+    default:
+      break;
+    }
+    // Now process the parent.
+    die = die.GetParent();
   }
+}
+
+std::vector<CompilerContext> DWARFDIE::GetTypeLookupContext() const {
+  llvm::SmallSet<lldb::user_id_t, 4> seen;
+  std::vector<CompilerContext> context;
+  GetTypeLookupContextImpl(*this, seen, context);
+  std::reverse(context.begin(), context.end());
   return context;
 }
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp
index 20c07a94b5076..30fb5d5ebdb0d 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.cpp
@@ -24,16 +24,11 @@ using namespace lldb_private::plugin::dwarf;
 DWARFIndex::~DWARFIndex() = default;
 
 bool DWARFIndex::ProcessFunctionDIE(
-    const Module::LookupInfo &lookup_info, DIERef ref, SymbolFileDWARF &dwarf,
+    const Module::LookupInfo &lookup_info, DWARFDIE die,
     const CompilerDeclContext &parent_decl_ctx,
     llvm::function_ref<bool(DWARFDIE die)> callback) {
   llvm::StringRef name = lookup_info.GetLookupName().GetStringRef();
   FunctionNameType name_type_mask = lookup_info.GetNameTypeMask();
-  DWARFDIE die = dwarf.GetDIE(ref);
-  if (!die) {
-    ReportInvalidDIERef(ref, name);
-    return true;
-  }
 
   if (!(name_type_mask & eFunctionNameTypeFull)) {
     ConstString name_to_match_against;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h
index 0551b07100a96..cb3ae8a06d788 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFIndex.h
@@ -81,11 +81,10 @@ class DWARFIndex {
   StatsDuration m_index_time;
 
   /// Helper function implementing common logic for processing function dies. If
-  /// the function given by "ref" matches search criteria given by
-  /// "parent_decl_ctx" and "name_type_mask", it is inserted into the "dies"
-  /// vector.
-  bool ProcessFunctionDIE(const Module::LookupInfo &lookup_info, DIERef ref,
-                          SymbolFileDWARF &dwarf,
+  /// the function given by "die" matches the search criteria given by
+  /// "lookup_info" and "parent_decl_ctx", it calls the callback with the
+  /// given die.
+  bool ProcessFunctionDIE(const Module::LookupInfo &lookup_info, DWARFDIE die,
                           const CompilerDeclContext &parent_decl_ctx,
                           llvm::function_ref<bool(DWARFDIE die)> callback);
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp
index 79400e36e04f3..56717bab1ecd8 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp
@@ -64,29 +64,31 @@ DebugNamesDWARFIndex::GetNonSkeletonUnit(const DebugNames::Entry &entry) const {
   return cu ? &cu->GetNonSkeletonUnit() : nullptr;
 }
 
-std::optional<DIERef>
-DebugNamesDWARFIndex::ToDIERef(const DebugNames::Entry &entry) const {
+DWARFDIE DebugNamesDWARFIndex::GetDIE(const DebugNames::Entry &entry) const {
   DWARFUnit *unit = GetNonSkeletonUnit(entry);
-  if (!unit)
-    return std::nullopt;
-  if (std::optional<uint64_t> die_offset = entry.getDIEUnitOffset())
-    return DIERef(unit->GetSymbolFileDWARF().GetFileIndex(),
-                  DIERef::Section::DebugInfo, unit->GetOffset() + *die_offset);
-
-  return std::nullopt;
+  std::optional<uint64_t> die_offset = entry.getDIEUnitOffset();
+  if (!unit || !die_offset)
+    return DWARFDIE();
+  if (DWARFDIE die = unit->GetDIE(unit->GetOffset() + *die_offset))
+    return die;
+
+  m_module.ReportErrorIfModifyDetected(
+      "the DWARF debug information has been modified (bad offset {0:x} in "
+      "debug_names section)\n",
+      *die_offset);
+  return DWARFDIE();
 }
 
 bool DebugNamesDWARFIndex::ProcessEntry(
     const DebugNames::Entry &entry,
     llvm::function_ref<bool(DWARFDIE die)> callback) {
-  std::optional<DIERef> ref = ToDIERef(entry);
-  if (!ref)
-    return true;
-  SymbolFileDWARF &dwarf = *llvm::cast<SymbolFileDWARF>(
-      m_module.GetSymbolFile()->GetBackingSymbolFile());
-  DWARFDIE die = dwarf.GetDIE(*ref);
+  DWARFDIE die = GetDIE(entry);
   if (!die)
     return true;
+  // Clang erroneously emits index entries for declaration DIEs when the
+  // definition is in a type unit (llvm.org/pr77696). Weed those out.
+  if (die.GetAttributeValueAsUnsigned(DW_AT_declaration, 0))
+    return true;
   return callback(die);
 }
 
@@ -183,7 +185,7 @@ void DebugNamesDWARFIndex::GetCompleteObjCClass(
     llvm::function_ref<bool(DWARFDIE die)> callback) {
   // Keep a list of incomplete types as fallback for when we don't find the
   // complete type.
-  DIEArray incomplete_types;
+  std::vector<DWARFDIE> incomplete_types;
 
   for (const DebugNames::Entry &entry :
        m_debug_names_up->equal_range(class_name.GetStringRef())) {
@@ -191,19 +193,14 @@ void DebugNamesDWARFIndex::GetCompleteObjCClass(
         entry.tag() != DW_TAG_class_type)
       continue;
 
-    std::optional<DIERef> ref = ToDIERef(entry);
-    if (!ref)
-      continue;
-
-    DWARFUnit *cu = m_debug_info.GetUnit(*ref);
-    if (!cu || !cu->Supports_DW_AT_APPLE_objc_complete_type()) {
-      incomplete_types.push_back(*ref);
+    DWARFDIE die = GetDIE(entry);
+    if (!die) {
+      // Skip entries with no DIE; GetDIE reports bad DIE offsets itself.
       continue;
     }
-
-    DWARFDIE die = m_debug_info.GetDIE(*ref);
-    if (!die) {
-      ReportInvalidDIERef(*ref, class_name.GetStringRef());
+    DWARFUnit *cu = die.GetCU();
+    if (!cu->Supports_DW_AT_APPLE_objc_complete_type()) {
+      incomplete_types.push_back(die);
       continue;
     }
 
@@ -212,12 +209,11 @@ void DebugNamesDWARFIndex::GetCompleteObjCClass(
       callback(die);
       return;
     }
-    incomplete_types.push_back(*ref);
+    incomplete_types.push_back(die);
   }
 
-  auto dierefcallback = DIERefCallback(callback, class_name.GetStringRef());
-  for (DIERef ref : incomplete_types)
-    if (!dierefcallback(ref))
+  for (DWARFDIE die : incomplete_types)
+    if (!callback(die))
       return;
 
   m_fallback.GetCompleteObjCClass(class_name, must_be_implementation, callback);
@@ -379,8 +375,8 @@ void DebugNamesDWARFIndex::GetFunctions(
     if (tag != DW_TAG_subprogram && tag != DW_TAG_inlined_subroutine)
       continue;
 
-    if (std::optional<DIERef> ref = ToDIERef(entry)) {
-      if (!ProcessFunctionDIE(lookup_info, *ref, dwarf, parent_decl_ctx,
+    if (DWARFDIE die = GetDIE(entry)) {
+      if (!ProcessFunctionDIE(lookup_info, die, parent_decl_ctx,
                               [&](DWARFDIE die) {
                                 if (!seen.insert(die.GetDIE()).second)
                                   return true;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h
index 81fb8f88b805a..a27a414ecdd19 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.h
@@ -84,7 +84,7 @@ class DebugNamesDWARFIndex : public DWARFIndex {
   ManualDWARFIndex m_fallback;
 
   DWARFUnit *GetNonSkeletonUnit(const DebugNames::Entry &entry) const;
-  std::optional<DIERef> ToDIERef(const DebugNames::Entry &entry) const;
+  DWARFDIE GetDIE(const DebugNames::Entry &entry) const;
   bool ProcessEntry(const DebugNames::Entry &entry,
                     llvm::function_ref<bool(DWARFDIE die)> callback);
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index f6f152726bf74..bc489e5b8ad46 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -481,6 +481,13 @@ static ConstString GetDWARFMachOSegmentName() {
   return g_dwarf_section_name;
 }
 
+llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> &
+SymbolFileDWARF::GetForwardDeclCompilerTypeToDIE() {
+  if (SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile())
+    return debug_map_symfile->GetForwardDeclCompilerTypeToDIE();
+  return m_forward_decl_compiler_type_to_die;
+}
+
 UniqueDWARFASTTypeMap &SymbolFileDWARF::GetUniqueDWARFASTTypeMap() {
   SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile();
   if (debug_map_symfile)
@@ -1632,27 +1639,33 @@ bool SymbolFileDWARF::CompleteType(CompilerType &compiler_type) {
     return true;
   }
 
-  DWARFDIE dwarf_die = GetDIE(die_it->getSecond());
-  if (dwarf_die) {
-    // Once we start resolving this type, remove it from the forward
-    // declaration map in case anyone child members or other types require this
-    // type to get resolved. The type will get resolved when all of the calls
-    // to SymbolFileDWARF::ResolveClangOpaqueTypeDefinition are done.
-    GetForwardDeclCompilerTypeToDIE().erase(die_it);
-
-    Type *type = GetDIEToType().lookup(dwarf_die.GetDIE());
+  // Once we start resolving this type, remove it from the forward
+  // declaration map in case anyone's child members or other types require this
+  // type to get resolved.
+  DWARFDIE dwarf_die = GetDIE(die_it->second);
+  GetForwardDeclCompilerTypeToDIE().erase(die_it);
+  Type *type = nullptr;
+  if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU()))
+    type = dwarf_ast->FindDefinitionTypeForDIE(dwarf_die);
+  if (!type)
+    return false;
 
-    Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion);
-    if (log)
-      GetObjectFile()->GetModule()->LogMessageVerboseBacktrace(
-          log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...",
-          dwarf_die.GetID(), DW_TAG_value_to_name(dwarf_die.Tag()),
-          dwarf_die.Tag(), type->GetName().AsCString());
-    assert(compiler_type);
-    if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU()))
-      return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type);
+  die_it = GetForwardDeclCompilerTypeToDIE().find(
+      compiler_type_no_qualifiers.GetOpaqueQualType());
+  if (die_it != GetForwardDeclCompilerTypeToDIE().end()) {
+    dwarf_die = GetDIE(die_it->getSecond());
+    GetForwardDeclCompilerTypeToDIE().erase(die_it);
   }
-  return false;
+
+  if (Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion))
+    GetObjectFile()->GetModule()->LogMessageVerboseBacktrace(
+        log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...",
+        dwarf_die.GetID(), DW_TAG_value_to_name(dwarf_die.Tag()),
+        dwarf_die.Tag(), type->GetName().AsCString());
+  assert(compiler_type);
+  if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU()))
+    return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type);
+  return true;
 }
 
 Type *SymbolFileDWARF::ResolveType(const DWARFDIE &die,
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h
index 7282c08c6857c..35893f2072dd6 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h
@@ -335,12 +335,8 @@ class SymbolFileDWARF : public SymbolFileCommon {
 
   virtual DIEToTypePtr &GetDIEToType() { return m_die_to_type; }
 
-  typedef llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef>
-      CompilerTypeToDIE;
-
-  virtual CompilerTypeToDIE &GetForwardDeclCompilerTypeToDIE() {
-    return m_forward_decl_compiler_type_to_die;
-  }
+  virtual llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> &
+  GetForwardDeclCompilerTypeToDIE();
 
   typedef llvm::DenseMap<const DWARFDebugInfoEntry *, lldb::VariableSP>
       DIEToVariableSP;
@@ -533,9 +529,14 @@ class SymbolFileDWARF : public SymbolFileCommon {
   NameToOffsetMap m_function_scope_qualified_name_map;
   std::unique_ptr<DWARFDebugRanges> m_ranges;
   UniqueDWARFASTTypeMap m_unique_ast_type_map;
+  // A map from DIE to lldb_private::Type. For record types, the key might be
+  // either a declaration DIE or a definition DIE.
   DIEToTypePtr m_die_to_type;
   DIEToVariableSP m_die_to_variable_sp;
-  CompilerTypeToDIE m_forward_decl_compiler_type_to_die;
+  // A map from CompilerType to the struct/class/union/enum DIE (might be a
+  // declaration or a definition) that is used to construct it.
+  llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef>
+      m_forward_decl_compiler_type_to_die;
   llvm::DenseMap<dw_offset_t, std::unique_ptr<SupportFileList>>
       m_type_unit_support_files;
   std::vector<uint32_t> m_lldb_cu_to_dwarf_unit;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h
index de22dd676eef0..d7d571919bc7d 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h
@@ -284,6 +284,11 @@ class SymbolFileDWARFDebugMap : public SymbolFileCommon {
   lldb::TypeSP FindCompleteObjCDefinitionTypeForDIE(
       const DWARFDIE &die, ConstString type_name, bool must_be_implementation);
 
+  llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> &
+  GetForwardDeclCompilerTypeToDIE() {
+    return m_forward_decl_compiler_type_to_die;
+  }
+
   UniqueDWARFASTTypeMap &GetUniqueDWARFASTTypeMap() {
     return m_unique_ast_type_map;
   }
@@ -321,6 +326,10 @@ class SymbolFileDWARFDebugMap : public SymbolFileCommon {
   std::vector<uint32_t> m_func_indexes; // Sorted by address
   std::vector<uint32_t> m_glob_indexes;
   std::map<std::pair<ConstString, llvm::sys::TimePoint<>>, OSOInfoSP> m_oso_map;
+  // A map from CompilerType to the struct/class/union/enum DIE (might be a
+  // declaration or a definition) that is used to construct it.
+  llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef>
+      m_forward_decl_compiler_type_to_die;
   UniqueDWARFASTTypeMap m_unique_ast_type_map;
   LazyBool m_supports_DW_AT_APPLE_objc_complete_type;
   DebugMap m_debug_map;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp
index 85e1afd0d8976..8fd369c65f86b 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp
@@ -110,7 +110,7 @@ SymbolFileDWARF::DIEToVariableSP &SymbolFileDWARFDwo::GetDIEToVariable() {
   return GetBaseSymbolFile().GetDIEToVariable();
 }
 
-SymbolFileDWARF::CompilerTypeToDIE &
+llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> &
 SymbolFileDWARFDwo::GetForwardDeclCompilerTypeToDIE() {
   return GetBaseSymbolFile().GetForwardDeclCompilerTypeToDIE();
 }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h
index 1500540424b52..2f0ac415e90d4 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h
@@ -72,7 +72,8 @@ class SymbolFileDWARFDwo : public SymbolFileDWARF {
 
   DIEToVariableSP &GetDIEToVariable() override;
 
-  CompilerTypeToDIE &GetForwardDeclCompilerTypeToDIE() override;
+  llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> &
+  GetForwardDeclCompilerTypeToDIE() override;
 
   UniqueDWARFASTTypeMap &GetUniqueDWARFASTTypeMap() override;
 
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp
index 223518f0ae824..4762356034cab 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp
@@ -13,66 +13,67 @@
 using namespace lldb_private::dwarf;
 using namespace lldb_private::plugin::dwarf;
 
-bool UniqueDWARFASTTypeList::Find(const DWARFDIE &die,
-                                  const lldb_private::Declaration &decl,
-                                  const int32_t byte_size,
-                                  UniqueDWARFASTType &entry) const {
-  for (const UniqueDWARFASTType &udt : m_collection) {
+UniqueDWARFASTType *UniqueDWARFASTTypeList::Find(
+    const DWARFDIE &die, const lldb_private::Declaration &decl,
+    const int32_t byte_size, bool is_forward_declaration) {
+  for (UniqueDWARFASTType &udt : m_collection) {
     // Make sure the tags match
     if (udt.m_die.Tag() == die.Tag()) {
-      // Validate byte sizes of both types only if both are valid.
-      if (udt.m_byte_size < 0 || byte_size < 0 ||
-          udt.m_byte_size == byte_size) {
-        // Make sure the file and line match
-        if (udt.m_declaration == decl) {
-          // The type has the same name, and was defined on the same file and
-          // line. Now verify all of the parent DIEs match.
-          DWARFDIE parent_arg_die = die.GetParent();
-          DWARFDIE parent_pos_die = udt.m_die.GetParent();
-          bool match = true;
-          bool done = false;
-          while (!done && match && parent_arg_die && parent_pos_die) {
-            const dw_tag_t parent_arg_tag = parent_arg_die.Tag();
-            const dw_tag_t parent_pos_tag = parent_pos_die.Tag();
-            if (parent_arg_tag == parent_pos_tag) {
-              switch (parent_arg_tag) {
-              case DW_TAG_class_type:
-              case DW_TAG_structure_type:
-              case DW_TAG_union_type:
-              case DW_TAG_namespace: {
-                const char *parent_arg_die_name = parent_arg_die.GetName();
-                if (parent_arg_die_name ==
-                    nullptr) // Anonymous (i.e. no-name) struct
-                {
-                  match = false;
-                } else {
-                  const char *parent_pos_die_name = parent_pos_die.GetName();
-                  if (parent_pos_die_name == nullptr ||
-                      ((parent_arg_die_name != parent_pos_die_name) &&
-                       strcmp(parent_arg_die_name, parent_pos_die_name)))
-                    match = false;
-                }
-              } break;
-
-              case DW_TAG_compile_unit:
-              case DW_TAG_partial_unit:
-                done = true;
-                break;
-              default:
-                break;
-              }
+      // If they are not both definition DIEs or both declaration DIEs, then
+      // don't check for byte size and declaration location, because declaration
+      // DIEs usually don't have that information.
+      bool matching_size_declaration =
+          udt.m_is_forward_declaration != is_forward_declaration
+              ? true
+              : (udt.m_byte_size < 0 || byte_size < 0 ||
+                 udt.m_byte_size == byte_size) &&
+                    udt.m_declaration == decl;
+      if (!matching_size_declaration)
+        continue;
+      // The type has the same name, and was defined on the same file and
+      // line. Now verify all of the parent DIEs match.
+      DWARFDIE parent_arg_die = die.GetParent();
+      DWARFDIE parent_pos_die = udt.m_die.GetParent();
+      bool match = true;
+      bool done = false;
+      while (!done && match && parent_arg_die && parent_pos_die) {
+        const dw_tag_t parent_arg_tag = parent_arg_die.Tag();
+        const dw_tag_t parent_pos_tag = parent_pos_die.Tag();
+        if (parent_arg_tag == parent_pos_tag) {
+          switch (parent_arg_tag) {
+          case DW_TAG_class_type:
+          case DW_TAG_structure_type:
+          case DW_TAG_union_type:
+          case DW_TAG_namespace: {
+            const char *parent_arg_die_name = parent_arg_die.GetName();
+            if (parent_arg_die_name == nullptr) {
+              // Anonymous (i.e. no-name) struct
+              match = false;
+            } else {
+              const char *parent_pos_die_name = parent_pos_die.GetName();
+              if (parent_pos_die_name == nullptr ||
+                  ((parent_arg_die_name != parent_pos_die_name) &&
+                   strcmp(parent_arg_die_name, parent_pos_die_name)))
+                match = false;
             }
-            parent_arg_die = parent_arg_die.GetParent();
-            parent_pos_die = parent_pos_die.GetParent();
-          }
+          } break;
 
-          if (match) {
-            entry = udt;
-            return true;
+          case DW_TAG_compile_unit:
+          case DW_TAG_partial_unit:
+            done = true;
+            break;
+          default:
+            break;
           }
         }
+        parent_arg_die = parent_arg_die.GetParent();
+        parent_pos_die = parent_pos_die.GetParent();
+      }
+
+      if (match) {
+        return &udt;
       }
     }
   }
-  return false;
+  return nullptr;
 }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h
index bf3cbae55e5c7..29e5c02dcbe17 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h
@@ -23,31 +23,19 @@ class UniqueDWARFASTType {
   // Constructors and Destructors
   UniqueDWARFASTType() : m_type_sp(), m_die(), m_declaration() {}
 
-  UniqueDWARFASTType(lldb::TypeSP &type_sp, const DWARFDIE &die,
-                     const Declaration &decl, int32_t byte_size)
-      : m_type_sp(type_sp), m_die(die), m_declaration(decl),
-        m_byte_size(byte_size) {}
-
   UniqueDWARFASTType(const UniqueDWARFASTType &rhs)
       : m_type_sp(rhs.m_type_sp), m_die(rhs.m_die),
-        m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size) {}
+        m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size),
+        m_is_forward_declaration(rhs.m_is_forward_declaration) {}
 
   ~UniqueDWARFASTType() = default;
 
-  UniqueDWARFASTType &operator=(const UniqueDWARFASTType &rhs) {
-    if (this != &rhs) {
-      m_type_sp = rhs.m_type_sp;
-      m_die = rhs.m_die;
-      m_declaration = rhs.m_declaration;
-      m_byte_size = rhs.m_byte_size;
-    }
-    return *this;
-  }
-
   lldb::TypeSP m_type_sp;
   DWARFDIE m_die;
   Declaration m_declaration;
   int32_t m_byte_size = -1;
+  // True if the m_die is a forward declaration DIE.
+  bool m_is_forward_declaration = true;
 };
 
 class UniqueDWARFASTTypeList {
@@ -62,8 +50,9 @@ class UniqueDWARFASTTypeList {
     m_collection.push_back(entry);
   }
 
-  bool Find(const DWARFDIE &die, const Declaration &decl,
-            const int32_t byte_size, UniqueDWARFASTType &entry) const;
+  UniqueDWARFASTType *Find(const DWARFDIE &die, const Declaration &decl,
+                           const int32_t byte_size,
+                           bool is_forward_declaration);
 
 protected:
   typedef std::vector<UniqueDWARFASTType> collection;
@@ -80,14 +69,15 @@ class UniqueDWARFASTTypeMap {
     m_collection[name.GetCString()].Append(entry);
   }
 
-  bool Find(ConstString name, const DWARFDIE &die, const Declaration &decl,
-            const int32_t byte_size, UniqueDWARFASTType &entry) const {
+  UniqueDWARFASTType *Find(ConstString name, const DWARFDIE &die,
+                           const Declaration &decl, const int32_t byte_size,
+                           bool is_forward_declaration) {
     const char *unique_name_cstr = name.GetCString();
-    collection::const_iterator pos = m_collection.find(unique_name_cstr);
+    collection::iterator pos = m_collection.find(unique_name_cstr);
     if (pos != m_collection.end()) {
-      return pos->second.Find(die, decl, byte_size, entry);
+      return pos->second.Find(die, decl, byte_size, is_forward_declaration);
     }
-    return false;
+    return nullptr;
   }
 
 protected:
diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
index fab3ca989c0ec..17c5f6118603f 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
@@ -47,15 +47,18 @@ UdtRecordCompleter::UdtRecordCompleter(
   CVType cvt = m_index.tpi().getType(m_id.index);
   switch (cvt.kind()) {
   case LF_ENUM:
+    m_cvr.er.Options = ClassOptions::None;
     llvm::cantFail(TypeDeserializer::deserializeAs<EnumRecord>(cvt, m_cvr.er));
     break;
   case LF_UNION:
+    m_cvr.ur.Options = ClassOptions::None;
     llvm::cantFail(TypeDeserializer::deserializeAs<UnionRecord>(cvt, m_cvr.ur));
     m_layout.bit_size = m_cvr.ur.getSize() * 8;
     m_record.record.kind = Member::Union;
     break;
   case LF_CLASS:
   case LF_STRUCTURE:
+    m_cvr.cr.Options = ClassOptions::None;
     llvm::cantFail(TypeDeserializer::deserializeAs<ClassRecord>(cvt, m_cvr.cr));
     m_layout.bit_size = m_cvr.cr.getSize() * 8;
     m_record.record.kind = Member::Struct;
diff --git a/lldb/source/Symbol/Block.cpp b/lldb/source/Symbol/Block.cpp
index 6eeabe0ff5e4d..f7d9c0d2d3306 100644
--- a/lldb/source/Symbol/Block.cpp
+++ b/lldb/source/Symbol/Block.cpp
@@ -314,6 +314,22 @@ bool Block::GetRangeAtIndex(uint32_t range_idx, AddressRange &range) {
   return false;
 }
 
+AddressRanges Block::GetRanges() {
+  AddressRanges ranges;
+  Function *function = CalculateSymbolContextFunction();
+  if (!function)
+    return ranges;
+  for (size_t i = 0, e = m_ranges.GetSize(); i < e; ++i) {
+    ranges.emplace_back();
+    auto &range = ranges.back();
+    const Range &vm_range = m_ranges.GetEntryRef(i);
+    range.GetBaseAddress() = function->GetAddressRange().GetBaseAddress();
+    range.GetBaseAddress().Slide(vm_range.GetRangeBase());
+    range.SetByteSize(vm_range.GetByteSize());
+  }
+  return ranges;
+}
+
 bool Block::GetStartAddress(Address &addr) {
   if (m_ranges.IsEmpty())
     return false;
diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp
index 6bf69c2ded287..585808ace15ce 100644
--- a/lldb/source/Symbol/Type.cpp
+++ b/lldb/source/Symbol/Type.cpp
@@ -36,6 +36,13 @@
 using namespace lldb;
 using namespace lldb_private;
 
+llvm::raw_ostream &lldb_private::operator<<(llvm::raw_ostream &os,
+                                            const CompilerContext &rhs) {
+  StreamString lldb_stream;
+  rhs.Dump(lldb_stream);
+  return os << lldb_stream.GetString();
+}
+
 bool lldb_private::contextMatches(llvm::ArrayRef<CompilerContext> context_chain,
                                   llvm::ArrayRef<CompilerContext> pattern) {
   auto ctx = context_chain.begin();
diff --git a/lldb/source/Target/RegisterFlags.cpp b/lldb/source/Target/RegisterFlags.cpp
index b1669b85fd2fe..5274960587bf3 100644
--- a/lldb/source/Target/RegisterFlags.cpp
+++ b/lldb/source/Target/RegisterFlags.cpp
@@ -190,7 +190,7 @@ std::string RegisterFlags::AsTable(uint32_t max_width) const {
   return table;
 }
 
-void RegisterFlags::ToXML(StreamString &strm) const {
+void RegisterFlags::ToXML(Stream &strm) const {
   // Example XML:
   // <flags id="cpsr_flags" size="4">
   //   <field name="incorrect" start="0" end="0"/>
@@ -213,7 +213,7 @@ void RegisterFlags::ToXML(StreamString &strm) const {
   strm.Indent("</flags>\n");
 }
 
-void RegisterFlags::Field::ToXML(StreamString &strm) const {
+void RegisterFlags::Field::ToXML(Stream &strm) const {
   // Example XML:
   // <field name="correct" start="0" end="0"/>
   strm.Indent();
diff --git a/lldb/test/API/commands/expression/fixits/TestFixIts.py b/lldb/test/API/commands/expression/fixits/TestFixIts.py
index bc53b72fe611b..1b22ed1c0077c 100644
--- a/lldb/test/API/commands/expression/fixits/TestFixIts.py
+++ b/lldb/test/API/commands/expression/fixits/TestFixIts.py
@@ -106,9 +106,8 @@ def test_with_target_error_applies_fixit(self):
         )
         self.assertIn("null_pointer->first", ret_val.GetError())
 
-    # The final function call runs into SIGILL on aarch64-linux.
     @expectedFailureAll(
-        archs=["aarch64"], oslist=["freebsd", "linux"], bugnumber="llvm.org/pr49407"
+        archs=["aarch64"], oslist=["freebsd"], bugnumber="llvm.org/pr49407"
     )
     def test_with_multiple_retries(self):
         """Test calling expressions with errors that can be fixed by the FixIts."""
diff --git a/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py b/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py
index 5fc37ac6a5818..ea3aa6a4608c4 100644
--- a/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py
+++ b/lldb/test/API/commands/expression/static-initializers/TestStaticInitializers.py
@@ -7,8 +7,8 @@
 class StaticInitializers(TestBase):
     @expectedFailureAll(
         archs="aarch64",
-        oslist=["freebsd", "linux"],
-        bugnumber="https://bugs.llvm.org/show_bug.cgi?id=44053",
+        oslist=["freebsd"],
+        bugnumber="llvm.org/pr44053",
     )
     def test(self):
         """Test a static initializer."""
diff --git a/lldb/test/API/python_api/address_range/Makefile b/lldb/test/API/python_api/address_range/Makefile
new file mode 100644
index 0000000000000..99998b20bcb05
--- /dev/null
+++ b/lldb/test/API/python_api/address_range/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/python_api/address_range/TestAddressRange.py b/lldb/test/API/python_api/address_range/TestAddressRange.py
new file mode 100644
index 0000000000000..65221e3f1b0e9
--- /dev/null
+++ b/lldb/test/API/python_api/address_range/TestAddressRange.py
@@ -0,0 +1,254 @@
+"""
+Test SBAddressRange APIs.
+"""
+
+import lldb
+from lldbsuite.test.lldbtest import *
+
+
+class AddressRangeTestCase(TestBase):
+    NO_DEBUG_INFO_TESTCASE = True
+
+    def setUp(self):
+        TestBase.setUp(self)
+
+        self.build()
+        exe = self.getBuildArtifact("a.out")
+
+        self.target = self.dbg.CreateTarget(exe)
+        self.assertTrue(self.target, VALID_TARGET)
+        self.launch_info = self.target.GetLaunchInfo()
+        self.launch_info.SetWorkingDirectory(self.get_process_working_directory())
+
+        self.bp1 = self.target.BreakpointCreateByName("main", "a.out")
+        self.bp2 = self.target.BreakpointCreateByName("foo", "a.out")
+        self.bp3 = self.target.BreakpointCreateByName("bar", "a.out")
+
+        self.assertTrue(self.bp1.IsValid())
+        self.assertTrue(self.bp2.IsValid())
+        self.assertTrue(self.bp3.IsValid())
+
+        self.addr1 = self.bp1.GetLocationAtIndex(0).GetAddress()
+        self.addr2 = self.bp2.GetLocationAtIndex(0).GetAddress()
+        self.addr3 = self.bp3.GetLocationAtIndex(0).GetAddress()
+
+        self.assertTrue(self.addr1.IsValid())
+        self.assertTrue(self.addr2.IsValid())
+        self.assertTrue(self.addr3.IsValid())
+
+    def test_address_range_default(self):
+        """Testing default constructor."""
+        empty_range = lldb.SBAddressRange()
+        self.assertEqual(empty_range.IsValid(), False)
+
+    def test_address_range_construction(self):
+        """Make sure the construction and getters work."""
+        range = lldb.SBAddressRange(self.addr1, 8)
+        self.assertEqual(range.IsValid(), True)
+        self.assertEqual(range.GetBaseAddress(), self.addr1)
+        self.assertEqual(range.GetByteSize(), 8)
+
+    def test_address_range_clear(self):
+        """Make sure the clear method works."""
+        range = lldb.SBAddressRange(self.addr1, 8)
+        self.assertEqual(range.IsValid(), True)
+        self.assertEqual(range.GetBaseAddress(), self.addr1)
+        self.assertEqual(range.GetByteSize(), 8)
+
+        range.Clear()
+        self.assertEqual(range.IsValid(), False)
+
+    def test_function(self):
+        """Make sure the range works in SBFunction APIs."""
+
+        # Use the first location of the breakpoint on main to find its function
+        loc = self.bp1.GetLocationAtIndex(0)
+        loc_addr = loc.GetAddress()
+        func = loc_addr.GetFunction()
+        ranges = func.GetRanges()
+        self.assertEqual(ranges.GetSize(), 1)
+
+        range = ranges.GetAddressRangeAtIndex(0)
+        self.assertEqual(
+            range.GetByteSize(),
+            func.GetEndAddress().GetOffset() - func.GetStartAddress().GetOffset(),
+        )
+        self.assertEqual(
+            range.GetBaseAddress().GetOffset(),
+            func.GetStartAddress().GetOffset(),
+        )
+
+    def test_block(self):
+        """Make sure the range works in SBBlock APIs."""
+        loc = self.bp1.GetLocationAtIndex(0)
+        loc_addr = loc.GetAddress()
+        block = loc_addr.GetBlock()
+
+        ranges = block.GetRanges()
+        self.assertEqual(ranges.GetSize(), 1)
+
+        range = ranges.GetAddressRangeAtIndex(0)
+        self.assertEqual(
+            range.GetByteSize(),
+            block.GetRangeEndAddress(0).GetOffset()
+            - block.GetRangeStartAddress(0).GetOffset(),
+        )
+        self.assertEqual(
+            range.GetBaseAddress().GetOffset(),
+            block.GetRangeStartAddress(0).GetOffset(),
+        )
+
+    def test_address_range_list(self):
+        """Make sure the SBAddressRangeList works by adding and getting ranges."""
+        range1 = lldb.SBAddressRange(self.addr1, 8)
+        range2 = lldb.SBAddressRange(self.addr2, 16)
+        range3 = lldb.SBAddressRange(self.addr3, 32)
+
+        range_list = lldb.SBAddressRangeList()
+        self.assertEqual(range_list.GetSize(), 0)
+
+        range_list.Append(range1)
+        range_list.Append(range2)
+        range_list.Append(range3)
+        self.assertEqual(range_list.GetSize(), 3)
+        self.assertRaises(IndexError, lambda: range_list[3])
+
+        range1_copy = range_list.GetAddressRangeAtIndex(0)
+        self.assertEqual(range1.GetByteSize(), range1_copy.GetByteSize())
+        self.assertEqual(
+            range1.GetBaseAddress().GetOffset(),
+            range1_copy.GetBaseAddress().GetOffset(),
+        )
+
+        range2_copy = range_list.GetAddressRangeAtIndex(1)
+        self.assertEqual(range2.GetByteSize(), range2_copy.GetByteSize())
+        self.assertEqual(
+            range2.GetBaseAddress().GetOffset(),
+            range2_copy.GetBaseAddress().GetOffset(),
+        )
+
+        range3_copy = range_list.GetAddressRangeAtIndex(2)
+        self.assertEqual(range3.GetByteSize(), range3_copy.GetByteSize())
+        self.assertEqual(
+            range3.GetBaseAddress().GetOffset(),
+            range3_copy.GetBaseAddress().GetOffset(),
+        )
+
+        range_list.Clear()
+        self.assertEqual(range_list.GetSize(), 0)
+
+    def test_address_range_list_len(self):
+        """Make sure the len() operator works."""
+        range = lldb.SBAddressRange(self.addr1, 8)
+
+        range_list = lldb.SBAddressRangeList()
+        self.assertEqual(len(range_list), 0)
+
+        range_list.Append(range)
+        self.assertEqual(len(range_list), 1)
+
+    def test_address_range_list_iterator(self):
+        """Make sure the SBAddressRangeList iterator works."""
+        range1 = lldb.SBAddressRange(self.addr1, 8)
+        range2 = lldb.SBAddressRange(self.addr2, 16)
+        range3 = lldb.SBAddressRange(self.addr3, 32)
+
+        range_list = lldb.SBAddressRangeList()
+        range_list.Append(range1)
+        range_list.Append(range2)
+        range_list.Append(range3)
+        self.assertEqual(range_list.GetSize(), 3)
+
+        # Test the iterator
+        for range in range_list:
+            self.assertTrue(range.IsValid())
+
+    def test_address_range_print_invalid(self):
+        """Make sure the SBAddressRange can be printed when invalid."""
+        range = lldb.SBAddressRange()
+        self.assertEqual(str(range), "<invalid>")
+
+    def test_address_range_print_resolved(self):
+        """Make sure the SBAddressRange can be printed when resolved."""
+        lldb.target = self.target
+        error = lldb.SBError()
+        process = self.target.Launch(self.launch_info, error)
+        self.assertTrue(error.Success(), "Make sure process launched successfully")
+        self.assertTrue(process, PROCESS_IS_VALID)
+        self.assertState(process.GetState(), lldb.eStateStopped, PROCESS_STOPPED)
+
+        loc = self.bp1.GetLocationAtIndex(0)
+        loc_addr = loc.GetAddress()
+        func = loc_addr.GetFunction()
+        range = func.GetRanges().GetAddressRangeAtIndex(0)
+        range_str = str(range)
+        # [0x1000-0x2000) // Resolved with target or addresses without sections
+        self.assertRegex(range_str, r"^\[0x[0-9a-f]+\-0x[0-9a-f]+\)$")
+        process.Kill()
+
+    def test_address_range_print_no_section_resolved(self):
+        """Make sure the SBAddressRange can be printed with no section."""
+        lldb.target = self.target
+        error = lldb.SBError()
+        process = self.target.Launch(self.launch_info, error)
+        self.assertTrue(error.Success(), "Make sure process launched successfully")
+        self.assertTrue(process, PROCESS_IS_VALID)
+        self.assertState(process.GetState(), lldb.eStateStopped, PROCESS_STOPPED)
+
+        loc = self.bp1.GetLocationAtIndex(0)
+        loc_addr = loc.GetAddress()
+        func = loc_addr.GetFunction()
+        range = func.GetRanges().GetAddressRangeAtIndex(0)
+
+        addr = lldb.SBAddress()
+        addr.SetAddress(lldb.SBSection(), range.GetBaseAddress().GetOffset())
+        self.assertFalse(addr.GetSection().IsValid())
+        range = lldb.SBAddressRange(addr, range.GetByteSize())
+
+        range_str = str(range)
+        # [0x1000-0x2000) // Resolved with target or addresses without sections
+        self.assertRegex(range_str, r"^\[0x[0-9a-f]+\-0x[0-9a-f]+\)$")
+        process.Kill()
+
+    def test_address_range_print_not_resolved(self):
+        """Make sure the SBAddressRange can be printed when not resolved."""
+        range = lldb.SBAddressRange(self.addr1, 8)
+        range_str = str(range)
+        # a.out[0x1000-0x2000) // Without target
+        self.assertRegex(range_str, r"^a\.out\[0x[0-9a-f]+\-0x[0-9a-f]+\)$")
+
+    def test_address_range_list_print(self):
+        """Make sure the SBAddressRangeList can be printed."""
+        range1 = lldb.SBAddressRange(self.addr1, 8)
+        range2 = lldb.SBAddressRange(self.addr2, 16)
+        range3 = lldb.SBAddressRange(self.addr3, 32)
+        self.dbg.SetAsync(True)
+
+        range_list = lldb.SBAddressRangeList()
+        self.assertEqual(range_list.GetSize(), 0)
+
+        range_list.Append(range1)
+        range_list.Append(range2)
+        range_list.Append(range3)
+        self.assertEqual(range_list.GetSize(), 3)
+
+        range_list_str = str(range_list)
+        self.assertTrue(range_list_str.startswith("["))
+        self.assertGreater(range_list_str.count(","), 1)
+        self.assertTrue(range_list_str.endswith("]"))
+
+    def test_address_range_list_indexing(self):
+        """Make sure the SBAddressRangeList can be indexed."""
+        range1 = lldb.SBAddressRange(self.addr1, 8)
+        range2 = lldb.SBAddressRange(self.addr2, 16)
+        range_list = lldb.SBAddressRangeList()
+        range_list.Append(range1)
+        range_list.Append(range2)
+
+        self.assertEqual(range_list.GetSize(), 2)
+        self.assertRaises(IndexError, lambda: range_list[2])
+        self.assertRaises(TypeError, lambda: range_list["0"])
+        self.assertEqual(range_list[0], range1)
+        self.assertEqual(range_list[1], range2)
+        self.assertEqual(range_list[-1], range2)
+        self.assertEqual(range_list[-2], range1)
diff --git a/lldb/test/API/python_api/address_range/main.cpp b/lldb/test/API/python_api/address_range/main.cpp
new file mode 100644
index 0000000000000..b6eaec4a23699
--- /dev/null
+++ b/lldb/test/API/python_api/address_range/main.cpp
@@ -0,0 +1,8 @@
+void foo() {}
+void bar() {}
+
+int main() {
+  foo();
+  bar();
+  return 0;
+}
diff --git a/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test
new file mode 100644
index 0000000000000..d253981b498c8
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test
@@ -0,0 +1,36 @@
+# Test definition DIE searching is delayed until complete type is required.
+
+# UNSUPPORTED: system-windows
+
+# RUN: split-file %s %t
+# RUN: %clangxx_host %t/main.cpp %t/t1_def.cpp -gdwarf -o %t.out
+# RUN: %lldb -b %t.out -s %t/lldb.cmd | FileCheck %s
+
+# CHECK: (lldb) p v1
+# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't2<t1>'
+# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1'
+# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't2<t1>' resolving forward declaration...
+# CHECK: (t2<t1>)  {}
+# CHECK: (lldb) p v2
+# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1'
+# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't1' resolving forward declaration...
+
+#--- lldb.cmd
+log enable dwarf comp
+p v1
+p v2
+
+#--- main.cpp
+template<typename T>
+struct t2 {
+};
+struct t1;
+t2<t1> v1; // this CU doesn't have definition DIE for t1, but only declaration DIE for it.
+int main() {
+}
+
+#--- t1_def.cpp
+struct t1 { // this CU contains definition DIE for t1.
+  int x;
+};
+t1 v2;
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index c7eb3db4304a9..d419f821999e6 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -103,7 +103,9 @@ void DAP::SendJSON(const llvm::json::Value &json) {
   SendJSON(json_str);
 
   if (log) {
-    *log << "<-- " << std::endl
+    auto now = std::chrono::duration<double>(
+        std::chrono::system_clock::now().time_since_epoch());
+    *log << llvm::formatv("{0:f9} <-- ", now.count()).str() << std::endl
          << "Content-Length: " << json_str.size() << "\r\n\r\n"
          << llvm::formatv("{0:2}", json).str() << std::endl;
   }
@@ -130,9 +132,12 @@ std::string DAP::ReadJSON() {
   if (!input.read_full(log.get(), length, json_str))
     return json_str;
 
-  if (log)
-    *log << "--> " << std::endl << "Content-Length: " << length << "\r\n\r\n";
-
+  if (log) {
+    auto now = std::chrono::duration<double>(
+        std::chrono::system_clock::now().time_since_epoch());
+    *log << llvm::formatv("{0:f9} --> ", now.count()).str() << std::endl
+         << "Content-Length: " << length << "\r\n\r\n";
+  }
   return json_str;
 }
 
diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
index 20742ea512309..bea07dfa27cc6 100644
--- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
+++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp
@@ -10,6 +10,8 @@
 #include "Plugins/SymbolFile/DWARF/DWARFDebugInfo.h"
 #include "TestingSupport/Symbol/YAMLModuleTester.h"
 #include "lldb/Core/dwarf.h"
+#include "lldb/Symbol/Type.h"
+#include "lldb/lldb-private-enumerations.h"
 #include "llvm/ADT/STLExtras.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -187,3 +189,72 @@ TEST(DWARFDIETest, PeekName) {
   dw_offset_t fifth_die_offset = 26;
   EXPECT_EQ(unit->PeekDIEName(fifth_die_offset), "NameType2");
 }
+
+TEST(DWARFDIETest, GetContext) {
+  const char *yamldata = R"(
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_EXEC
+  Machine: EM_386
+DWARF:
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x1
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_language
+              Form:            DW_FORM_data2
+        - Code:            0x2
+          Tag:             DW_TAG_namespace
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_string
+        - Code:            0x3
+          Tag:             DW_TAG_structure_type
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_string
+  debug_info:
+    - Version:         4
+      AddrSize:        8
+      Entries:
+        - AbbrCode:        0x1
+          Values:
+            - Value:           0x000000000000000C
+        - AbbrCode:        0x2
+          Values:
+            - CStr:            NAMESPACE
+        - AbbrCode:        0x3
+          Values:
+            - CStr:            STRUCT
+        - AbbrCode:        0x0
+        - AbbrCode:        0x0
+)";
+
+  YAMLModuleTester t(yamldata);
+  auto *symbol_file =
+      llvm::cast<SymbolFileDWARF>(t.GetModule()->GetSymbolFile());
+  DWARFUnit *unit = symbol_file->DebugInfo().GetUnitAtIndex(0);
+  ASSERT_TRUE(unit);
+
+  auto make_namespace = [](llvm::StringRef name) {
+    return CompilerContext(CompilerContextKind::Namespace, ConstString(name));
+  };
+  auto make_struct = [](llvm::StringRef name) {
+    return CompilerContext(CompilerContextKind::Struct, ConstString(name));
+  };
+  DWARFDIE struct_die = unit->DIE().GetFirstChild().GetFirstChild();
+  ASSERT_TRUE(struct_die);
+  EXPECT_THAT(
+      struct_die.GetDeclContext(),
+      testing::ElementsAre(make_namespace("NAMESPACE"), make_struct("STRUCT")));
+  EXPECT_THAT(
+      struct_die.GetTypeLookupContext(),
+      testing::ElementsAre(make_namespace("NAMESPACE"), make_struct("STRUCT")));
+}
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 612e90abd4091..64898ab09772f 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -560,6 +560,8 @@ set(LLVM_USE_STATIC_ZSTD FALSE CACHE BOOL "Use static version of zstd. Can be TR
 
 set(LLVM_ENABLE_CURL "OFF" CACHE STRING "Use libcurl for the HTTP client if available. Can be ON, OFF, or FORCE_ON")
 
+set(LLVM_HAS_LOGF128 "OFF" CACHE STRING "Use logf128 to constant fold fp128 logarithm calls. Can be ON, OFF, or FORCE_ON")
+
 set(LLVM_ENABLE_HTTPLIB "OFF" CACHE STRING "Use cpp-httplib HTTP server library if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_Z3_INSTALL_DIR "" CACHE STRING "Install directory of the Z3 solver.")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 8cfb36b0194e8..0aae13e30f2ab 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -247,6 +247,17 @@ else()
   set(HAVE_LIBEDIT 0)
 endif()
 
+if(LLVM_HAS_LOGF128)
+  include(CheckCXXSymbolExists)
+  check_cxx_symbol_exists(logf128 math.h HAS_LOGF128)
+
+  if(LLVM_HAS_LOGF128 STREQUAL FORCE_ON AND NOT HAS_LOGF128)
+    message(FATAL_ERROR "Failed to configure logf128")
+  endif()
+
+  set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 1004956ac8f10..b827524e6b8db 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -64,7 +64,7 @@ to specify the target triple:
      Vendor       Description
      ============ ==============================================================
      ``amd``      Can be used for all AMD GPU usage.
-     ``mesa3d``   Can be used if the OS is ``mesa3d``.
+     ``mesa``     Can be used if the OS is ``mesa3d``.
      ============ ==============================================================
 
   .. table:: AMDGPU Operating Systems
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 614dd98b013b3..7b64c477d13c7 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -4754,6 +4754,40 @@ reference to the CFI jump table in the ``LowerTypeTests`` pass. These constants
 may be useful in low-level programs, such as operating system kernels, which
 need to refer to the actual function body.
 
+.. _ptrauth_constant:
+
+Pointer Authentication Constants
+--------------------------------
+
+``ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?)``
+
+A '``ptrauth``' constant represents a pointer with a cryptographic
+authentication signature embedded into some bits, as described in the
+`Pointer Authentication <PointerAuth.html>`__ document.
+
+A '``ptrauth``' constant is simply a constant equivalent to the
+``llvm.ptrauth.sign`` intrinsic, potentially fed by a discriminator
+``llvm.ptrauth.blend`` if needed.
+
+Its type is the same as the first argument.  An integer constant discriminator
+and an address discriminator may be optionally specified.  Otherwise, they have
+values ``i64 0`` and ``ptr null``.
+
+If the address discriminator is ``null`` then the expression is equivalent to:
+
+.. code-block:: llvm
+
+    %tmp = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr CST to i64), i32 KEY, i64 DISC)
+    %val = inttoptr i64 %tmp to ptr
+
+Otherwise, the expression is equivalent to:
+
+.. code-block:: llvm
+
+    %tmp1 = call i64 @llvm.ptrauth.blend(i64 ptrtoint (ptr ADDRDISC to i64), i64 DISC)
+    %tmp2 = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr CST to i64), i32 KEY, i64 %tmp1)
+    %val = inttoptr i64 %tmp2 to ptr
+
 .. _constantexprs:
 
 Constant Expressions
diff --git a/llvm/docs/PointerAuth.md b/llvm/docs/PointerAuth.md
index a8d2b4d8f5f0b..cf2cc6305f130 100644
--- a/llvm/docs/PointerAuth.md
+++ b/llvm/docs/PointerAuth.md
@@ -16,6 +16,7 @@ For more details, see the clang documentation page for
 At the IR level, it is represented using:
 
 * a [set of intrinsics](#intrinsics) (to sign/authenticate pointers)
+* a [signed pointer constant](#constant) (to sign globals)
 * a [call operand bundle](#operand-bundle) (to authenticate called pointers)
 
 The current implementation leverages the
@@ -225,6 +226,27 @@ with a pointer address discriminator, in a way that is specified by the target
 implementation.
 
 
+### Constant
+
+[Intrinsics](#intrinsics) can be used to produce signed pointers dynamically,
+in code, but not for signed pointers referenced by constants, in, e.g., global
+initializers.
+
+The latter are represented using a
+[``ptrauth`` constant](https://llvm.org/docs/LangRef.html#ptrauth-constant),
+which describes an authenticated relocation producing a signed pointer.
+
+```llvm
+ptrauth (ptr CST, i32 KEY, i64 DISC, ptr ADDRDISC)
+```
+
+is equivalent to:
+
+```llvm
+  %disc = call i64 @llvm.ptrauth.blend(i64 ptrtoint(ptr ADDRDISC to i64), i64 DISC)
+  %signedval = call i64 @llvm.ptrauth.sign(i64 ptrtoint(ptr CST to i64), i32 KEY, i64 %disc)
+```
+
 ### Operand Bundle
 
 Function pointers used as indirect call targets can be signed when materialized,
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index 657b0fb9b6724..de27f6b2372db 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -141,10 +141,16 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na
      - Allows generating arbitrary width integer types.
    * - ``SPV_INTEL_bfloat16_conversion``
      - Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values.
+   * - ``SPV_INTEL_cache_controls``
+     - Allows cache control information to be applied to memory access instructions.
    * - ``SPV_INTEL_function_pointers``
      - Allows translation of function pointers.
    * - ``SPV_INTEL_inline_assembly``
      - Allows to use inline assembly.
+   * - ``SPV_INTEL_global_variable_host_access``
+     - Adds decorations that can be applied to global (module scope) variables.
+   * - ``SPV_INTEL_global_variable_fpga_decorations``
+     - Adds decorations that can be applied to global (module scope) variables to help code generation for FPGA devices.
    * - ``SPV_INTEL_optnone``
      - Adds OptNoneINTEL value for Function Control mask that indicates a request to not optimize the function.
    * - ``SPV_INTEL_subgroups``
diff --git a/llvm/docs/Security.rst b/llvm/docs/Security.rst
index 9140923e5e8c9..a468ff51d2a6a 100644
--- a/llvm/docs/Security.rst
+++ b/llvm/docs/Security.rst
@@ -55,6 +55,7 @@ username for an individual isn't available, the brackets will be empty.
 * Serge Guelton (Mozilla) [@serge-sans-paille]
 * Shayne Hiet-Block (Microsoft) [@GreatKeeper]
 * Tim Penge (Sony) []
+* Tulio Magno Quites Machado Filho (Red Hat) [@tuliom]
 * Will Huhn (Intel) [@wphuhn-intel]
 
 Criteria
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index deb74cb2fdeb1..44a301ecc9928 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/float128.h"
 #include <memory>
 
 #define APFLOAT_DISPATCH_ON_SEMANTICS(METHOD_CALL)                             \
@@ -354,6 +355,9 @@ class IEEEFloat final : public APFloatBase {
   Expected<opStatus> convertFromString(StringRef, roundingMode);
   APInt bitcastToAPInt() const;
   double convertToDouble() const;
+#ifdef HAS_IEE754_FLOAT128
+  float128 convertToQuad() const;
+#endif
   float convertToFloat() const;
 
   /// @}
@@ -1218,6 +1222,15 @@ class APFloat : public APFloatBase {
   /// shorter semantics, like IEEEsingle and others.
   double convertToDouble() const;
 
+  /// Converts this APFloat to host float128 value.
+  ///
+  /// \pre The APFloat must be built using semantics, that can be represented by
+  /// the host float128 type without loss of precision. It can be IEEEquad and
+  /// shorter semantics, like IEEEdouble and others.
+#ifdef HAS_IEE754_FLOAT128
+  float128 convertToQuad() const;
+#endif
+
   /// Converts this APFloat to host float value.
   ///
   /// \pre The APFloat must be built using semantics, that can be represented by
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index 2fd8b7ea636c4..6cfa6ec665084 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -17,6 +17,7 @@
 
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/float128.h"
 #include <cassert>
 #include <climits>
 #include <cstring>
@@ -1677,6 +1678,13 @@ class [[nodiscard]] APInt {
   /// any bit width. Exactly 64 bits will be translated.
   double bitsToDouble() const { return llvm::bit_cast<double>(getWord(0)); }
 
+#ifdef HAS_IEE754_FLOAT128
+  float128 bitsToQuad() const {
+    __uint128_t ul = ((__uint128_t)U.pVal[1] << 64) + U.pVal[0];
+    return llvm::bit_cast<float128>(ul);
+  }
+#endif
+
   /// Converts APInt bits to a float
   ///
   /// The conversion does not do a translation from integer to float, it just
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 5828cc156cc78..72f3d94542496 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -912,6 +912,13 @@ class ScalarEvolution {
     return getBackedgeTakenCount(L, SymbolicMaximum);
   }
 
+  /// Similar to getSymbolicMaxBackedgeTakenCount, except it will add a set of
+  /// SCEV predicates to Predicates that are required to be true in order for
+  /// the answer to be correct. Predicates can be checked with run-time
+  /// checks and can be used to perform loop versioning.
+  const SCEV *getPredicatedSymbolicMaxBackedgeTakenCount(
+      const Loop *L, SmallVector<const SCEVPredicate *, 4> &Predicates);
+
   /// Return true if the backedge taken count is either the value returned by
   /// getConstantMaxBackedgeTakenCount or zero.
   bool isBackedgeTakenCountMaxOrZero(const Loop *L);
@@ -1549,7 +1556,9 @@ class ScalarEvolution {
                                ScalarEvolution *SE) const;
 
     /// Get the symbolic max backedge taken count for the loop.
-    const SCEV *getSymbolicMax(const Loop *L, ScalarEvolution *SE);
+    const SCEV *
+    getSymbolicMax(const Loop *L, ScalarEvolution *SE,
+                   SmallVector<const SCEVPredicate *, 4> *Predicates = nullptr);
 
     /// Get the symbolic max backedge taken count for the particular loop exit.
     const SCEV *getSymbolicMax(const BasicBlock *ExitingBlock,
@@ -1746,7 +1755,7 @@ class ScalarEvolution {
 
   /// Similar to getBackedgeTakenInfo, but will add predicates as required
   /// with the purpose of returning complete information.
-  const BackedgeTakenInfo &getPredicatedBackedgeTakenInfo(const Loop *L);
+  BackedgeTakenInfo &getPredicatedBackedgeTakenInfo(const Loop *L);
 
   /// Compute the number of times the specified loop will iterate.
   /// If AllowPredicates is set, we will create new SCEV predicates as
@@ -1761,11 +1770,6 @@ class ScalarEvolution {
   ExitLimit computeExitLimit(const Loop *L, BasicBlock *ExitingBlock,
                              bool AllowPredicates = false);
 
-  /// Return a symbolic upper bound for the backedge taken count of the loop.
-  /// This is more general than getConstantMaxBackedgeTakenCount as it returns
-  /// an arbitrary expression as opposed to only constants.
-  const SCEV *computeSymbolicMaxBackedgeTakenCount(const Loop *L);
-
   // Helper functions for computeExitLimitFromCond to avoid exponential time
   // complexity.
 
@@ -2316,6 +2320,9 @@ class PredicatedScalarEvolution {
   /// Get the (predicated) backedge count for the analyzed loop.
   const SCEV *getBackedgeTakenCount();
 
+  /// Get the (predicated) symbolic max backedge count for the analyzed loop.
+  const SCEV *getSymbolicMaxBackedgeTakenCount();
+
   /// Adds a new predicate.
   void addPredicate(const SCEVPredicate &Pred);
 
@@ -2384,6 +2391,9 @@ class PredicatedScalarEvolution {
 
   /// The backedge taken count.
   const SCEV *BackedgeCount = nullptr;
+
+  /// The symbolic max backedge taken count.
+  const SCEV *SymbolicMaxBackedgeCount = nullptr;
 };
 
 template <> struct DenseMapInfo<ScalarEvolution::FoldID> {
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index df61ec6ed30e0..69821c22dcd61 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -346,6 +346,7 @@ enum Kind {
   kw_blockaddress,
   kw_dso_local_equivalent,
   kw_no_cfi,
+  kw_ptrauth,
 
   kw_freeze,
 
diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h
index 38ef8e37df91d..acf89885af6fd 100644
--- a/llvm/include/llvm/BinaryFormat/Wasm.h
+++ b/llvm/include/llvm/BinaryFormat/Wasm.h
@@ -58,15 +58,16 @@ enum : unsigned {
   WASM_TYPE_V128 = 0x7B,
   WASM_TYPE_NULLFUNCREF = 0x73,
   WASM_TYPE_NULLEXTERNREF = 0x72,
+  WASM_TYPE_NULLEXNREF = 0x74,
   WASM_TYPE_NULLREF = 0x71,
   WASM_TYPE_FUNCREF = 0x70,
   WASM_TYPE_EXTERNREF = 0x6F,
+  WASM_TYPE_EXNREF = 0x69,
   WASM_TYPE_ANYREF = 0x6E,
   WASM_TYPE_EQREF = 0x6D,
   WASM_TYPE_I31REF = 0x6C,
   WASM_TYPE_STRUCTREF = 0x6B,
   WASM_TYPE_ARRAYREF = 0x6A,
-  WASM_TYPE_EXNREF = 0x69,
   WASM_TYPE_NONNULLABLE = 0x64,
   WASM_TYPE_NULLABLE = 0x63,
   WASM_TYPE_FUNC = 0x60,
@@ -261,8 +262,9 @@ enum class ValType {
   V128 = WASM_TYPE_V128,
   FUNCREF = WASM_TYPE_FUNCREF,
   EXTERNREF = WASM_TYPE_EXTERNREF,
+  EXNREF = WASM_TYPE_EXNREF,
   // Unmodeled value types include ref types with heap types other than
-  // func or extern, and type-specialized funcrefs
+  // func, extern or exn, and type-specialized funcrefs
   OTHERREF = 0xff,
 };
 
@@ -410,7 +412,8 @@ struct WasmDataSegment {
 // 1) Does not model passive or declarative segments (Segment will end up with
 // an Offset field of i32.const 0)
 // 2) Does not model init exprs (Segment will get an empty Functions list)
-// 2) Does not model types other than basic funcref/externref (see ValType)
+// 3) Does not model types other than basic funcref/externref/exnref (see
+// ValType)
 struct WasmElemSegment {
   uint32_t Flags;
   uint32_t TableNumber;
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index d3b9e96520f88..9999aee61528e 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -413,6 +413,7 @@ enum ConstantsCodes {
                                       //                 asmstr,conststr]
   CST_CODE_CE_GEP_WITH_INRANGE = 31,  // [opty, flags, range, n x operands]
   CST_CODE_CE_GEP = 32,               // [opty, flags, n x operands]
+  CST_CODE_PTRAUTH = 33,              // [ptr, key, disc, addrdisc]
 };
 
 /// CastOpcodes - These are values used in the bitcode files to encode which
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 96a6270690468..0dc237301abb4 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1241,11 +1241,11 @@ class SelectionDAG {
   /// Helper function to make it easier to build Select's if you just have
   /// operands and don't want to check for vector.
   SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS,
-                    SDValue RHS) {
+                    SDValue RHS, SDNodeFlags Flags = SDNodeFlags()) {
     assert(LHS.getValueType() == VT && RHS.getValueType() == VT &&
            "Cannot use select on differing types");
     auto Opcode = Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT;
-    return getNode(Opcode, DL, VT, Cond, LHS, RHS);
+    return getNode(Opcode, DL, VT, Cond, LHS, RHS, Flags);
   }
 
   /// Helper function to make it easier to build SelectCC's if you just have an
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index c3e378ed8f6ed..e322cc04c1c76 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -280,11 +280,12 @@ def untyped   : ValueType<8,    193> { // Produces an untyped value
 }
 def funcref   : ValueType<0,    194>;  // WebAssembly's funcref type
 def externref : ValueType<0,    195>;  // WebAssembly's externref type
-def x86amx    : ValueType<8192, 196>;  // X86 AMX value
-def i64x8     : ValueType<512,  197>;  // 8 Consecutive GPRs (AArch64)
+def exnref    : ValueType<0,    196>;  // WebAssembly's exnref type
+def x86amx    : ValueType<8192, 197>;  // X86 AMX value
+def i64x8     : ValueType<512,  198>;  // 8 Consecutive GPRs (AArch64)
 def aarch64svcount
-              : ValueType<16,  198>;  // AArch64 predicate-as-counter
-def spirvbuiltin : ValueType<0, 199>; // SPIR-V's builtin type
+              : ValueType<16,  199>;  // AArch64 predicate-as-counter
+def spirvbuiltin : ValueType<0, 200>; // SPIR-V's builtin type
 
 def token      : ValueType<0, 248>;  // TokenTy
 def MetadataVT : ValueType<0, 249> { // Metadata
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index 6605ea60df99e..629977cc11d68 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -198,4 +198,7 @@
 /* Define if plugins enabled */
 #cmakedefine LLVM_ENABLE_PLUGINS
 
+/* Define if logf128 is available */
+#cmakedefine LLVM_HAS_LOGF128
+
 #endif
diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
index 3fa27608ead94..3feb4bd11c998 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
@@ -371,9 +371,8 @@ ConstructDecompositionT<C, H>::addClauseSymsToMap(U &&item,
 // anything and return false, otherwise return true.
 template <typename C, typename H>
 bool ConstructDecompositionT<C, H>::applyToUnique(const ClauseTy *node) {
-  auto unique = detail::find_unique(leafs, [=](const auto &dirInfo) {
-    return llvm::omp::isAllowedClauseForDirective(dirInfo.id, node->id,
-                                                  version);
+  auto unique = detail::find_unique(leafs, [=](const auto &leaf) {
+    return llvm::omp::isAllowedClauseForDirective(leaf.id, node->id, version);
   });
 
   if (unique != leafs.end()) {
@@ -438,8 +437,8 @@ bool ConstructDecompositionT<C, H>::applyToAll(const ClauseTy *node) {
 }
 
 template <typename C, typename H>
-template <typename Clause>
-bool ConstructDecompositionT<C, H>::applyClause(Clause &&clause,
+template <typename Specific>
+bool ConstructDecompositionT<C, H>::applyClause(Specific &&specific,
                                                 const ClauseTy *node) {
   // The default behavior is to find the unique directive to which the
   // given clause may be applied. If there are no such directives, or
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index a1e5005a9d1da..86f6be7985a23 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -1008,6 +1008,72 @@ struct OperandTraits<NoCFIValue> : public FixedNumOperandTraits<NoCFIValue, 1> {
 
 DEFINE_TRANSPARENT_OPERAND_ACCESSORS(NoCFIValue, Value)
 
+/// A signed pointer, in the ptrauth sense.
+class ConstantPtrAuth final : public Constant {
+  friend struct ConstantPtrAuthKeyType;
+  friend class Constant;
+
+  ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc,
+                  Constant *AddrDisc);
+
+  void *operator new(size_t s) { return User::operator new(s, 4); }
+
+  void destroyConstantImpl();
+  Value *handleOperandChangeImpl(Value *From, Value *To);
+
+public:
+  /// Return a pointer signed with the specified parameters.
+  static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key,
+                              ConstantInt *Disc, Constant *AddrDisc);
+
+  /// Produce a new ptrauth expression signing the given value using
+  /// the same schema as is stored in this one.
+  ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const;
+
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant);
+
+  /// The pointer that is signed in this ptrauth signed pointer.
+  Constant *getPointer() const { return cast<Constant>(Op<0>().get()); }
+
+  /// The Key ID, an i32 constant.
+  ConstantInt *getKey() const { return cast<ConstantInt>(Op<1>().get()); }
+
+  /// The integer discriminator, an i64 constant, or 0.
+  ConstantInt *getDiscriminator() const {
+    return cast<ConstantInt>(Op<2>().get());
+  }
+
+  /// The address discriminator if any, or the null constant.
+  /// If present, this must be a value equivalent to the storage location of
+  /// the only global-initializer user of the ptrauth signed pointer.
+  Constant *getAddrDiscriminator() const {
+    return cast<Constant>(Op<3>().get());
+  }
+
+  /// Whether there is any non-null address discriminator.
+  bool hasAddressDiscriminator() const {
+    return !getAddrDiscriminator()->isNullValue();
+  }
+
+  /// Check whether an authentication operation with key \p Key and (possibly
+  /// blended) discriminator \p Discriminator is known to be compatible with
+  /// this ptrauth signed pointer.
+  bool isKnownCompatibleWith(const Value *Key, const Value *Discriminator,
+                             const DataLayout &DL) const;
+
+  /// Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const Value *V) {
+    return V->getValueID() == ConstantPtrAuthVal;
+  }
+};
+
+template <>
+struct OperandTraits<ConstantPtrAuth>
+    : public FixedNumOperandTraits<ConstantPtrAuth, 4> {};
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantPtrAuth, Constant)
+
 //===----------------------------------------------------------------------===//
 /// A constant value that is initialized with an expression using
 /// other constant values.
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 3019f68083d42..107442623ab7b 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -581,6 +581,7 @@ def llvm_vararg_ty     : LLVMType<isVoid>;   // this means vararg here
 
 def llvm_externref_ty  : LLVMType<externref>;
 def llvm_funcref_ty    : LLVMType<funcref>;
+def llvm_exnref_ty     : LLVMType<exnref>;
 
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 4544cf35fb7b3..9a71aaa9f4434 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3121,6 +3121,11 @@ let TargetPrefix = "aarch64" in {
     : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
                             [llvm_nxv4f32_ty, llvm_nxv4f32_ty],
                             [IntrNoMem]>;
+  
+  class SME2_CVT_WIDENING_VG2_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                            [LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
+  
 
   class SME2_CVT_VG4_SINGLE_Intrinsic
     : DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
@@ -3356,6 +3361,12 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_bfmlslt : SME2_BFMLS_Intrinsic;
   def int_aarch64_sve_bfmlslt_lane : SME2_BFMLS_Lane_Intrinsic;
 
+  // Multi-vector zeroing
+
+  foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
+    def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty],  [IntrNoMem, IntrHasSideEffects]>;
+  }
+  
   // Multi-vector signed saturating doubling multiply high
 
   def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;
@@ -3412,6 +3423,13 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_suvdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
   def int_aarch64_sme_usvdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
 
+
+  //
+  // Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
+  //
+  
+  def int_aarch64_sve_fcvtl_widen_x2  : SME2_CVT_WIDENING_VG2_Intrinsic;
+
   //
   // Multi-vector floating-point CVT from single-precision to interleaved half-precision/BFloat16
   //
@@ -3431,7 +3449,7 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic;
   def int_aarch64_sve_scvtf_x4  : SME2_CVT_X4_Intrinsic;
   def int_aarch64_sve_ucvtf_x4  : SME2_CVT_X4_Intrinsic;
-
+  def int_aarch64_sve_fcvt_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
   //
   // Multi-vector saturating extract narrow
   //
@@ -3472,10 +3490,12 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_sclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_uclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_fclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_bfclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
 
   def int_aarch64_sve_sclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_uclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
   def int_aarch64_sve_fclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
+  def int_aarch64_sve_bfclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
 
   //
   // Multi-vector add/sub and accumulate into ZA
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 237f268784bb0..47aab196a6d4f 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -31,12 +31,17 @@ def int_wasm_ref_null_extern :
   DefaultAttrsIntrinsic<[llvm_externref_ty], [], [IntrNoMem]>;
 def int_wasm_ref_null_func :
   DefaultAttrsIntrinsic<[llvm_funcref_ty], [], [IntrNoMem]>;
+def int_wasm_ref_null_exn:
+  DefaultAttrsIntrinsic<[llvm_exnref_ty], [], [IntrNoMem]>;
 def int_wasm_ref_is_null_extern :
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_externref_ty], [IntrNoMem],
                         "llvm.wasm.ref.is_null.extern">;
 def int_wasm_ref_is_null_func :
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_funcref_ty],
                         [IntrNoMem], "llvm.wasm.ref.is_null.func">;
+def int_wasm_ref_is_null_exn :
+  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_exnref_ty], [IntrNoMem],
+                        "llvm.wasm.ref.is_null.exn">;
 
 //===----------------------------------------------------------------------===//
 // Table intrinsics
@@ -47,6 +52,9 @@ def int_wasm_table_set_externref :
 def int_wasm_table_set_funcref :
   DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_funcref_ty],
                         [IntrWriteMem]>;
+def int_wasm_table_set_exnref :
+  DefaultAttrsIntrinsic<[], [llvm_table_ty, llvm_i32_ty, llvm_exnref_ty],
+                        [IntrWriteMem]>;
 
 def int_wasm_table_get_externref :
   DefaultAttrsIntrinsic<[llvm_externref_ty], [llvm_table_ty, llvm_i32_ty],
@@ -54,6 +62,9 @@ def int_wasm_table_get_externref :
 def int_wasm_table_get_funcref :
   DefaultAttrsIntrinsic<[llvm_funcref_ty], [llvm_table_ty, llvm_i32_ty],
                         [IntrReadMem]>;
+def int_wasm_table_get_exnref :
+  DefaultAttrsIntrinsic<[llvm_exnref_ty], [llvm_table_ty, llvm_i32_ty],
+                        [IntrReadMem]>;
 
 // Query the current table size, and increase the current table size.
 def int_wasm_table_size :
@@ -68,6 +79,9 @@ def int_wasm_table_grow_externref :
 def int_wasm_table_grow_funcref :
   DefaultAttrsIntrinsic<[llvm_i32_ty],
                         [llvm_table_ty, llvm_funcref_ty, llvm_i32_ty], []>;
+def int_wasm_table_grow_exnref :
+  DefaultAttrsIntrinsic<[llvm_i32_ty],
+                        [llvm_table_ty, llvm_exnref_ty, llvm_i32_ty], []>;
 def int_wasm_table_fill_externref :
   DefaultAttrsIntrinsic<[],
                         [llvm_table_ty, llvm_i32_ty, llvm_externref_ty,
@@ -76,6 +90,10 @@ def int_wasm_table_fill_funcref :
   DefaultAttrsIntrinsic<[],
                         [llvm_table_ty, llvm_i32_ty, llvm_funcref_ty,
                          llvm_i32_ty], []>;
+def int_wasm_table_fill_exnref :
+  DefaultAttrsIntrinsic<[],
+                        [llvm_table_ty, llvm_i32_ty, llvm_exnref_ty,
+                         llvm_i32_ty], []>;
 
 //===----------------------------------------------------------------------===//
 // Trapping float-to-int conversions
diff --git a/llvm/include/llvm/IR/Value.def b/llvm/include/llvm/IR/Value.def
index 61f7a87666d09..3ece66a529e12 100644
--- a/llvm/include/llvm/IR/Value.def
+++ b/llvm/include/llvm/IR/Value.def
@@ -81,6 +81,7 @@ HANDLE_CONSTANT(BlockAddress)
 HANDLE_CONSTANT(ConstantExpr)
 HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(DSOLocalEquivalent)
 HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(NoCFIValue)
+HANDLE_CONSTANT_EXCLUDE_LLVM_C_API(ConstantPtrAuth)
 
 // ConstantAggregate.
 HANDLE_CONSTANT(ConstantArray)
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index 66a99f16cdb63..d44a2d1e2fb11 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -28,10 +28,12 @@ enum IndexedVersion : uint64_t {
   Version1 = 1,
   // Version 2: Added a call stack table.
   Version2 = 2,
+  // Version 3: Under development.
+  Version3 = 3,
 };
 
 constexpr uint64_t MinimumSupportedVersion = Version0;
-constexpr uint64_t MaximumSupportedVersion = Version2;
+constexpr uint64_t MaximumSupportedVersion = Version3;
 
 // Verify that the minimum and maximum satisfy the obvious constraint.
 static_assert(MinimumSupportedVersion <= MaximumSupportedVersion);
@@ -426,8 +428,8 @@ struct IndexedMemProfRecord {
   // Convert IndexedMemProfRecord to MemProfRecord.  Callback is used to
   // translate CallStackId to call stacks with frames inline.
   MemProfRecord toMemProfRecord(
-      llvm::function_ref<const llvm::SmallVector<Frame>(const CallStackId)>
-          Callback) const;
+      llvm::function_ref<llvm::SmallVector<Frame>(const CallStackId)> Callback)
+      const;
 
   // Returns the GUID for the function name after canonicalization. For
   // memprof, we remove any .llvm suffix added by LTO. MemProfRecords are
diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h
new file mode 100644
index 0000000000000..e15a98dc5a677
--- /dev/null
+++ b/llvm/include/llvm/Support/float128.h
@@ -0,0 +1,26 @@
+//===-- llvm/Support/float128.h - Compiler abstraction support --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_FLOAT128
+#define LLVM_FLOAT128
+
+namespace llvm {
+
+#if defined(__clang__) && defined(__FLOAT128__) &&                             \
+    defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__)
+#define HAS_IEE754_FLOAT128
+typedef __float128 float128;
+#elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) &&                   \
+    !defined(__LONG_DOUBLE_IBM128__) &&                                        \
+    (defined(__GNUC__) || defined(__GNUG__))
+#define HAS_IEE754_FLOAT128
+typedef _Float128 float128;
+#endif
+
+} // namespace llvm
+#endif // LLVM_FLOAT128
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index b3fff3c99025a..5025ab2491de8 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -183,55 +183,8 @@ struct ExtensionDependency {
   ArchExtKind Later;
 };
 
-// clang-format off
-// Each entry here is a link in the dependency chain starting from the
-// extension that was added to the architecture first.
-inline constexpr ExtensionDependency ExtensionDependencies[] = {
-  {AEK_FP, AEK_FP16},
-  {AEK_FP, AEK_SIMD},
-  {AEK_FP, AEK_JSCVT},
-  {AEK_FP, AEK_FP8},
-  {AEK_SIMD, AEK_CRYPTO},
-  {AEK_SIMD, AEK_AES},
-  {AEK_SIMD, AEK_SHA2},
-  {AEK_SIMD, AEK_SHA3},
-  {AEK_SIMD, AEK_SM4},
-  {AEK_SIMD, AEK_RDM},
-  {AEK_SIMD, AEK_DOTPROD},
-  {AEK_SIMD, AEK_FCMA},
-  {AEK_FP16, AEK_FP16FML},
-  {AEK_FP16, AEK_SVE},
-  {AEK_BF16, AEK_SME},
-  {AEK_BF16, AEK_B16B16},
-  {AEK_SVE, AEK_SVE2},
-  {AEK_SVE, AEK_F32MM},
-  {AEK_SVE, AEK_F64MM},
-  {AEK_SVE2, AEK_SVE2P1},
-  {AEK_SVE2, AEK_SVE2BITPERM},
-  {AEK_SVE2, AEK_SVE2AES},
-  {AEK_SVE2, AEK_SVE2SHA3},
-  {AEK_SVE2, AEK_SVE2SM4},
-  {AEK_SVE2, AEK_SMEFA64},
-  {AEK_SVE2, AEK_SMEFA64},
-  {AEK_SME, AEK_SME2},
-  {AEK_SME, AEK_SMEF16F16},
-  {AEK_SME, AEK_SMEF64F64},
-  {AEK_SME, AEK_SMEI16I64},
-  {AEK_SME, AEK_SMEFA64},
-  {AEK_SME2, AEK_SME2P1},
-  {AEK_SME2, AEK_SSVE_FP8FMA},
-  {AEK_SME2, AEK_SSVE_FP8DOT2},
-  {AEK_SME2, AEK_SSVE_FP8DOT4},
-  {AEK_SME2, AEK_SMEF8F16},
-  {AEK_SME2, AEK_SMEF8F32},
-  {AEK_FP8, AEK_SMEF8F16},
-  {AEK_FP8, AEK_SMEF8F32},
-  {AEK_LSE, AEK_LSE128},
-  {AEK_PREDRES, AEK_SPECRES2},
-  {AEK_RAS, AEK_RASV2},
-  {AEK_RCPC, AEK_RCPC3},
-};
-// clang-format on
+#define EMIT_EXTENSION_DEPENDENCIES
+#include "llvm/TargetParser/AArch64TargetParserDef.inc"
 
 enum ArchProfile { AProfile = 'A', RProfile = 'R', InvalidProfile = '?' };
 
diff --git a/llvm/include/llvm/Transforms/Scalar/Reassociate.h b/llvm/include/llvm/Transforms/Scalar/Reassociate.h
index f3a2e0f4380eb..84d72df6fc4d8 100644
--- a/llvm/include/llvm/Transforms/Scalar/Reassociate.h
+++ b/llvm/include/llvm/Transforms/Scalar/Reassociate.h
@@ -63,6 +63,16 @@ struct Factor {
   Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {}
 };
 
+struct OverflowTracking {
+  bool HasNUW;
+  bool HasNSW;
+  bool AllKnownNonNegative;
+  // Note: AllKnownNonNegative can be true in a case where one of the operands
+  // is negative, but one of the operators is not NSW. AllKnownNonNegative
+  // should not be used independently of HasNSW.
+  OverflowTracking() : HasNUW(true), HasNSW(true), AllKnownNonNegative(true) {}
+};
+
 class XorOpnd;
 
 } // end namespace reassociate
@@ -103,7 +113,7 @@ class ReassociatePass : public PassInfoMixin<ReassociatePass> {
   void ReassociateExpression(BinaryOperator *I);
   void RewriteExprTree(BinaryOperator *I,
                        SmallVectorImpl<reassociate::ValueEntry> &Ops,
-                       bool HasNUW);
+                       reassociate::OverflowTracking Flags);
   Value *OptimizeExpression(BinaryOperator *I,
                             SmallVectorImpl<reassociate::ValueEntry> &Ops);
   Value *OptimizeAdd(Instruction *I,
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 474b8d20fde16..74476cb5440c6 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -159,3 +159,9 @@ add_llvm_component_library(LLVMAnalysis
   Support
   TargetParser
   )
+
+include(CheckCXXSymbolExists)
+check_cxx_symbol_exists(logf128 math.h HAS_LOGF128)
+if(HAS_LOGF128)
+ target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128)
+endif()
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 705377b97ed90..5febe917126b1 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2087,6 +2087,17 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
     if (IntrinsicID == Intrinsic::canonicalize)
       return constantFoldCanonicalize(Ty, Call, U);
 
+#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
+    if (Ty->isFP128Ty()) {
+      switch (IntrinsicID) {
+      default:
+        return nullptr;
+      case Intrinsic::log:
+        return ConstantFP::get(Ty, logf128(Op->getValueAPF().convertToQuad()));
+      }
+    }
+#endif
+
     if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy())
       return nullptr;
 
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index bc8b9b8479e4f..bd4c2a35ebf2c 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1983,20 +1983,25 @@ getDependenceDistanceStrideAndSize(
     return MemoryDepChecker::Dependence::IndirectUnsafe;
 
   // Check if we can prove that Sink only accesses memory after Src's end or
-  // vice versa.
-  const auto &[SrcStart, SrcEnd] =
-      getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE);
-  const auto &[SinkStart, SinkEnd] =
-      getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE);
-
-  if (!isa<SCEVCouldNotCompute>(SrcStart) &&
-      !isa<SCEVCouldNotCompute>(SrcEnd) &&
-      !isa<SCEVCouldNotCompute>(SinkStart) &&
-      !isa<SCEVCouldNotCompute>(SinkEnd)) {
-    if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart))
-      return MemoryDepChecker::Dependence::NoDep;
-    if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart))
-      return MemoryDepChecker::Dependence::NoDep;
+  // vice versa. At the moment this is limited to cases where either source or
+  // sink are loop invariant to avoid compile-time increases. This is not
+  // required for correctness.
+  if (SE.isLoopInvariant(Src, InnermostLoop) ||
+      SE.isLoopInvariant(Sink, InnermostLoop)) {
+    const auto &[SrcStart, SrcEnd] =
+        getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE);
+    const auto &[SinkStart, SinkEnd] =
+        getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE);
+
+    if (!isa<SCEVCouldNotCompute>(SrcStart) &&
+        !isa<SCEVCouldNotCompute>(SrcEnd) &&
+        !isa<SCEVCouldNotCompute>(SinkStart) &&
+        !isa<SCEVCouldNotCompute>(SinkEnd)) {
+      if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart))
+        return MemoryDepChecker::Dependence::NoDep;
+      if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart))
+        return MemoryDepChecker::Dependence::NoDep;
+    }
   }
 
   // Need accesses with constant strides and the same direction. We don't want
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 8d971e6a78e42..e46d7183a2a35 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -8295,6 +8295,11 @@ const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L,
   llvm_unreachable("Invalid ExitCountKind!");
 }
 
+const SCEV *ScalarEvolution::getPredicatedSymbolicMaxBackedgeTakenCount(
+    const Loop *L, SmallVector<const SCEVPredicate *, 4> &Preds) {
+  return getPredicatedBackedgeTakenInfo(L).getSymbolicMax(L, this, &Preds);
+}
+
 bool ScalarEvolution::isBackedgeTakenCountMaxOrZero(const Loop *L) {
   return getBackedgeTakenInfo(L).isConstantMaxOrZero(this);
 }
@@ -8311,7 +8316,7 @@ static void PushLoopPHIs(const Loop *L,
       Worklist.push_back(&PN);
 }
 
-const ScalarEvolution::BackedgeTakenInfo &
+ScalarEvolution::BackedgeTakenInfo &
 ScalarEvolution::getPredicatedBackedgeTakenInfo(const Loop *L) {
   auto &BTI = getBackedgeTakenInfo(L);
   if (BTI.hasFullInfo())
@@ -8644,11 +8649,37 @@ ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const {
   return getConstantMax();
 }
 
-const SCEV *
-ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L,
-                                                   ScalarEvolution *SE) {
-  if (!SymbolicMax)
-    SymbolicMax = SE->computeSymbolicMaxBackedgeTakenCount(L);
+const SCEV *ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(
+    const Loop *L, ScalarEvolution *SE,
+    SmallVector<const SCEVPredicate *, 4> *Predicates) {
+  if (!SymbolicMax) {
+    // Form an expression for the maximum exit count possible for this loop by
+    // merging max and exact information; this approximates a version of
+    // getConstantMaxBackedgeTakenCount that is not restricted to constants.
+    // NOTE(review): when SymbolicMax is cached, Predicates is left untouched.
+    SmallVector<const SCEV *, 4> ExitCounts;
+
+    for (const auto &ENT : ExitNotTaken) {
+      const SCEV *ExitCount = ENT.SymbolicMaxNotTaken;
+      if (!isa<SCEVCouldNotCompute>(ExitCount)) {
+        assert(SE->DT.dominates(ENT.ExitingBlock, L->getLoopLatch()) &&
+               "We should only have known counts for exiting blocks that "
+               "dominate latch!");
+        ExitCounts.push_back(ExitCount);
+        if (Predicates)
+          for (const auto *P : ENT.Predicates)
+            Predicates->push_back(P);
+
+        assert((Predicates || ENT.hasAlwaysTruePredicate()) &&
+               "Predicate should be always true!");
+      }
+    }
+    if (ExitCounts.empty())
+      SymbolicMax = SE->getCouldNotCompute();
+    else
+      SymbolicMax =
+          SE->getUMinFromMismatchedTypes(ExitCounts, /*Sequential*/ true);
+  }
   return SymbolicMax;
 }
 
@@ -13589,6 +13620,24 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
       P->print(OS, 4);
   }
 
+  Preds.clear();
+  auto *PredSymbolicMax =
+      SE->getPredicatedSymbolicMaxBackedgeTakenCount(L, Preds);
+  if (SymbolicBTC != PredSymbolicMax) {
+    OS << "Loop ";
+    L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+    OS << ": ";
+    if (!isa<SCEVCouldNotCompute>(PredSymbolicMax)) {
+      OS << "Predicated symbolic max backedge-taken count is ";
+      PrintSCEVWithTypeHint(OS, PredSymbolicMax);
+    } else
+      OS << "Unpredictable predicated symbolic max backedge-taken count.";
+    OS << "\n";
+    OS << " Predicates:\n";
+    for (const auto *P : Preds)
+      P->print(OS, 4);
+  }
+
   if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
     OS << "Loop ";
     L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
@@ -14802,6 +14851,17 @@ const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() {
   return BackedgeCount;
 }
 
+const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() {
+  if (SymbolicMaxBackedgeCount)
+    return SymbolicMaxBackedgeCount;
+  SmallVector<const SCEVPredicate *, 4> NewPreds;
+  SymbolicMaxBackedgeCount =
+      SE.getPredicatedSymbolicMaxBackedgeTakenCount(&L, NewPreds);
+  for (const SCEVPredicate *Pred : NewPreds)
+    addPredicate(*Pred);
+  return SymbolicMaxBackedgeCount;
+}
+
 void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) {
   if (Preds->implies(&Pred))
     return;
@@ -14964,30 +15024,6 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
   return false;
 }
 
-const SCEV *
-ScalarEvolution::computeSymbolicMaxBackedgeTakenCount(const Loop *L) {
-  SmallVector<BasicBlock*, 16> ExitingBlocks;
-  L->getExitingBlocks(ExitingBlocks);
-
-  // Form an expression for the maximum exit count possible for this loop. We
-  // merge the max and exact information to approximate a version of
-  // getConstantMaxBackedgeTakenCount which isn't restricted to just constants.
-  SmallVector<const SCEV*, 4> ExitCounts;
-  for (BasicBlock *ExitingBB : ExitingBlocks) {
-    const SCEV *ExitCount =
-        getExitCount(L, ExitingBB, ScalarEvolution::SymbolicMaximum);
-    if (!isa<SCEVCouldNotCompute>(ExitCount)) {
-      assert(DT.dominates(ExitingBB, L->getLoopLatch()) &&
-             "We should only have known counts for exiting blocks that "
-             "dominate latch!");
-      ExitCounts.push_back(ExitCount);
-    }
-  }
-  if (ExitCounts.empty())
-    return getCouldNotCompute();
-  return getUMinFromMismatchedTypes(ExitCounts, /*Sequential*/ true);
-}
-
 /// A rewriter to replace SCEV expressions in Map with the corresponding entry
 /// in the map. It skips AddRecExpr because we cannot guarantee that the
 /// replacement is loop invariant in the loop of the AddRec.
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 3baa8ede28ffa..08138a5e2f2d9 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -3140,6 +3140,10 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts,
       return true;
     }
 
+    // Constant ptrauth can be null, iff the base pointer can be.
+    if (auto *CPA = dyn_cast<ConstantPtrAuth>(V))
+      return isKnownNonZero(CPA->getPointer(), DemandedElts, Q, Depth);
+
     // A global variable in address space 0 is non null unless extern weak
     // or an absolute symbol reference. Other address spaces may have null as a
     // valid address for a global, so we can't assume anything.
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 20a1bd2957712..d3ab306904da1 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -710,6 +710,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(blockaddress);
   KEYWORD(dso_local_equivalent);
   KEYWORD(no_cfi);
+  KEYWORD(ptrauth);
 
   // Metadata types.
   KEYWORD(distinct);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 5d2056d208567..df0827996396e 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4046,6 +4046,60 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
     ID.NoCFI = true;
     return false;
   }
+  case lltok::kw_ptrauth: {
+    // ValID ::= 'ptrauth' '(' ptr @foo ',' i32 <key>
+    //                         (',' i64 <disc> (',' ptr addrdisc)? )? ')'
+    Lex.Lex();
+
+    Constant *Ptr, *Key;
+    Constant *Disc = nullptr, *AddrDisc = nullptr;
+
+    if (parseToken(lltok::lparen,
+                   "expected '(' in constant ptrauth expression") ||
+        parseGlobalTypeAndValue(Ptr) ||
+        parseToken(lltok::comma,
+                   "expected comma in constant ptrauth expression") ||
+        parseGlobalTypeAndValue(Key))
+      return true;
+    // If present, parse the optional disc/addrdisc.
+    if (EatIfPresent(lltok::comma))
+      if (parseGlobalTypeAndValue(Disc) ||
+          (EatIfPresent(lltok::comma) && parseGlobalTypeAndValue(AddrDisc)))
+        return true;
+    if (parseToken(lltok::rparen,
+                   "expected ')' in constant ptrauth expression"))
+      return true;
+
+    if (!Ptr->getType()->isPointerTy())
+      return error(ID.Loc, "constant ptrauth base pointer must be a pointer");
+
+    auto *KeyC = dyn_cast<ConstantInt>(Key);
+    if (!KeyC || KeyC->getBitWidth() != 32)
+      return error(ID.Loc, "constant ptrauth key must be i32 constant");
+
+    ConstantInt *DiscC = nullptr;
+    if (Disc) {
+      DiscC = dyn_cast<ConstantInt>(Disc);
+      if (!DiscC || DiscC->getBitWidth() != 64)
+        return error(
+            ID.Loc,
+            "constant ptrauth integer discriminator must be i64 constant");
+    } else {
+      DiscC = ConstantInt::get(Type::getInt64Ty(Context), 0);
+    }
+
+    if (AddrDisc) {
+      if (!AddrDisc->getType()->isPointerTy())
+        return error(
+            ID.Loc, "constant ptrauth address discriminator must be a pointer");
+    } else {
+      AddrDisc = ConstantPointerNull::get(PointerType::get(Context, 0));
+    }
+
+    ID.ConstantVal = ConstantPtrAuth::get(Ptr, KeyC, DiscC, AddrDisc);
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
 
   case lltok::kw_trunc:
   case lltok::kw_bitcast:
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index c085c715179ba..b7ed9cdf63145 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -222,6 +222,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(CST_CODE, CE_UNOP)
       STRINGIFY_CODE(CST_CODE, DSO_LOCAL_EQUIVALENT)
       STRINGIFY_CODE(CST_CODE, NO_CFI_VALUE)
+      STRINGIFY_CODE(CST_CODE, PTRAUTH)
     case bitc::CST_CODE_BLOCKADDRESS:
       return "CST_CODE_BLOCKADDRESS";
       STRINGIFY_CODE(CST_CODE, DATA)
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 32b9a033173e9..aee627bbde0bf 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -517,7 +517,8 @@ class BitcodeConstant final : public Value,
   static constexpr uint8_t NoCFIOpcode = 252;
   static constexpr uint8_t DSOLocalEquivalentOpcode = 251;
   static constexpr uint8_t BlockAddressOpcode = 250;
-  static constexpr uint8_t FirstSpecialOpcode = BlockAddressOpcode;
+  static constexpr uint8_t ConstantPtrAuthOpcode = 249;
+  static constexpr uint8_t FirstSpecialOpcode = ConstantPtrAuthOpcode;
 
   // Separate struct to make passing different number of parameters to
   // BitcodeConstant::create() more convenient.
@@ -1562,6 +1563,18 @@ Expected<Value *> BitcodeReader::materializeValue(unsigned StartValID,
         C = ConstantExpr::get(BC->Opcode, ConstOps[0], ConstOps[1], BC->Flags);
       } else {
         switch (BC->Opcode) {
+        case BitcodeConstant::ConstantPtrAuthOpcode: {
+          auto *Key = dyn_cast<ConstantInt>(ConstOps[1]);
+          if (!Key)
+            return error("ptrauth key operand must be ConstantInt");
+
+          auto *Disc = dyn_cast<ConstantInt>(ConstOps[2]);
+          if (!Disc)
+            return error("ptrauth disc operand must be ConstantInt");
+
+          C = ConstantPtrAuth::get(ConstOps[0], Key, Disc, ConstOps[3]);
+          break;
+        }
         case BitcodeConstant::NoCFIOpcode: {
           auto *GV = dyn_cast<GlobalValue>(ConstOps[0]);
           if (!GV)
@@ -3644,6 +3657,16 @@ Error BitcodeReader::parseConstants() {
                                   Record[1]);
       break;
     }
+    case bitc::CST_CODE_PTRAUTH: {
+      if (Record.size() < 4)
+        return error("Invalid ptrauth record");
+      // Ptr, Key, Disc, AddrDisc
+      V = BitcodeConstant::create(Alloc, CurTy,
+                                  BitcodeConstant::ConstantPtrAuthOpcode,
+                                  {(unsigned)Record[0], (unsigned)Record[1],
+                                   (unsigned)Record[2], (unsigned)Record[3]});
+      break;
+    }
     }
 
     assert(V->getType() == getTypeByID(CurTyID) && "Incorrect result type ID");
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 3d653fe4458f4..046dad5721c4c 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -2848,6 +2848,12 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
       Code = bitc::CST_CODE_NO_CFI_VALUE;
       Record.push_back(VE.getTypeID(NC->getGlobalValue()->getType()));
       Record.push_back(VE.getValueID(NC->getGlobalValue()));
+    } else if (const auto *CPA = dyn_cast<ConstantPtrAuth>(C)) {
+      Code = bitc::CST_CODE_PTRAUTH;
+      Record.push_back(VE.getValueID(CPA->getPointer()));
+      Record.push_back(VE.getValueID(CPA->getKey()));
+      Record.push_back(VE.getValueID(CPA->getDiscriminator()));
+      Record.push_back(VE.getValueID(CPA->getAddrDiscriminator()));
     } else {
 #ifndef NDEBUG
       C->dump();
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index c04f7208c61f2..9208b096affad 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3972,7 +3972,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     // target can override this with custom lowering and calling the
     // implementation functions.
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
-    if (LI.isLegalOrCustom({G_UMIN, Ty}) && LI.isLegalOrCustom({G_UMAX, Ty}))
+    if (LI.isLegalOrCustom({G_UMIN, Ty}))
       return lowerAddSubSatToMinMax(MI);
     return lowerAddSubSatToAddoSubo(MI);
   }
@@ -7977,27 +7977,51 @@ LegalizerHelper::LegalizeResult
 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
   auto [Dst, Src] = MI.getFirst2Regs();
   const LLT Ty = MRI.getType(Src);
-  unsigned Size = Ty.getSizeInBits();
+  unsigned Size = Ty.getScalarSizeInBits();
+
+  if (Size >= 8) {
+    MachineInstrBuilder BSWAP =
+        MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
+
+    // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
+    //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
+    // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
+    MachineInstrBuilder Swap4 =
+        SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
+
+    // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
+    //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
+    // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
+    MachineInstrBuilder Swap2 =
+        SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
+
+    // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
+    // 6|7
+    //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
+    // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
+    SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
+  } else {
+    // Expand bitreverse for types smaller than 8 bits.
+    MachineInstrBuilder Tmp;
+    for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
+      MachineInstrBuilder Tmp2;
+      if (I < J) {
+        auto ShAmt = MIRBuilder.buildConstant(Ty, J - I);
+        Tmp2 = MIRBuilder.buildShl(Ty, Src, ShAmt);
+      } else {
+        auto ShAmt = MIRBuilder.buildConstant(Ty, I - J);
+        Tmp2 = MIRBuilder.buildLShr(Ty, Src, ShAmt);
+      }
 
-  MachineInstrBuilder BSWAP =
-      MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
-
-  // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
-  //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
-  // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
-  MachineInstrBuilder Swap4 =
-      SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
-
-  // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
-  //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
-  // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
-  MachineInstrBuilder Swap2 =
-      SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
-
-  // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
-  //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
-  // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
-  SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
+      auto Mask = MIRBuilder.buildConstant(Ty, 1U << J);
+      Tmp2 = MIRBuilder.buildAnd(Ty, Tmp2, Mask);
+      if (I == 0)
+        Tmp = Tmp2;
+      else
+        Tmp = MIRBuilder.buildOr(Ty, Tmp, Tmp2);
+    }
+    MIRBuilder.buildCopy(Dst, Tmp);
+  }
 
   MI.eraseFromParent();
   return Legalized;
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 727a98c41bce4..86eb259c09015 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1269,8 +1269,9 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
     Register DefReg = MI.getOperand(0).getReg();
     if (DefReg.isVirtual() &&
         all_of(MI.uses(),
-               [](const MachineOperand &UseOp) {
-                 return !UseOp.isReg() || UseOp.getReg().isVirtual();
+               [this](const MachineOperand &UseOp) {
+                 return !UseOp.isReg() || UseOp.getReg().isVirtual() ||
+                        MRI->isConstantPhysReg(UseOp.getReg());
                }) &&
         IsLoopInvariantInst(MI, CurLoop) &&
         any_of(MRI->use_nodbg_instructions(DefReg),
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 93d866384b482..42e861e61201c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10107,6 +10107,18 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     if (SDValue NewSHL = visitShiftByConstant(N))
       return NewSHL;
 
+  // fold (shl X, cttz(Y)) -> (mul (Y & -Y), X) if cttz is unsupported on the
+  // target.
+  if ((N1.getOpcode() == ISD::CTTZ || N1.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
+      N1.hasOneUse() && !TLI.isOperationLegalOrCustom(ISD::CTTZ, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::MUL, VT)) {
+    SDValue Y = N1.getOperand(0);
+    SDLoc DL(N);
+    SDValue NegY = DAG.getNegative(Y, DL, VT);
+    SDValue And = DAG.getNode(ISD::AND, DL, VT, Y, NegY);
+    return DAG.getNode(ISD::MUL, DL, VT, And, N0);
+  }
+
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
@@ -11186,17 +11198,19 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) {
   return SDValue();
 }
 
-// FIXME: This should be checking for no signed zeros on individual operands, as
-// well as no nans.
 static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS,
-                                         SDValue RHS,
+                                         SDValue RHS, const SDNodeFlags Flags,
                                          const TargetLowering &TLI) {
-  const TargetOptions &Options = DAG.getTarget().Options;
   EVT VT = LHS.getValueType();
+  if (!VT.isFloatingPoint())
+    return false;
+
+  const TargetOptions &Options = DAG.getTarget().Options;
 
-  return Options.NoSignedZerosFPMath && VT.isFloatingPoint() &&
+  return (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) &&
          TLI.isProfitableToCombineMinNumMaxNum(VT) &&
-         DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS);
+         (Flags.hasNoNaNs() ||
+          (DAG.isKnownNeverNaN(RHS) && DAG.isKnownNeverNaN(LHS)));
 }
 
 static SDValue combineMinNumMaxNumImpl(const SDLoc &DL, EVT VT, SDValue LHS,
@@ -11674,7 +11688,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
     // select (fcmp gt x, y), x, y -> fmaxnum x, y
     //
     // This is OK if we don't care what happens if either operand is a NaN.
-    if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI))
+    if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, Flags, TLI))
       if (SDValue FMinMax =
               combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2, CC))
         return FMinMax;
@@ -12267,7 +12281,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
     // This is OK if we don't care about what happens if either operand is a
     // NaN.
     //
-    if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) {
+    if (N0.hasOneUse() &&
+        isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, N->getFlags(), TLI)) {
       if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC))
         return FMinMax;
     }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index fc96ecdc66280..fb1424f75e097 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2488,6 +2488,8 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FMINIMUM:
     case ISD::FMAXNUM:
     case ISD::FMINNUM:
+    case ISD::FMAXNUM_IEEE:
+    case ISD::FMINNUM_IEEE:
     case ISD::FMUL:
     case ISD::FPOW:
     case ISD::FREM:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8fda35f008632..12f1d005249d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -646,18 +646,21 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
     }
   }
 
-  // Zero extend to the promoted type and do the count there.
-  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
-
   // Subtract off the extra leading bits in the bigger type.
   SDValue ExtractLeadingBits = DAG.getConstant(
       NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT);
-  if (!N->isVPOpcode())
+  if (!N->isVPOpcode()) {
+    // Zero extend to the promoted type and do the count there.
+    SDValue Op = ZExtPromotedInteger(N->getOperand(0));
     return DAG.getNode(ISD::SUB, dl, NVT,
                        DAG.getNode(N->getOpcode(), dl, NVT, Op),
                        ExtractLeadingBits);
+  }
+
   SDValue Mask = N->getOperand(1);
   SDValue EVL = N->getOperand(2);
+  // Zero extend to the promoted type and do the count there.
+  SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
   return DAG.getNode(ISD::VP_SUB, dl, NVT,
                      DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL),
                      ExtractLeadingBits, Mask, EVL);
@@ -681,11 +684,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) {
   }
 
   // Zero extend to the promoted type and do the count or parity there.
-  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
-  if (!N->isVPOpcode())
+  if (!N->isVPOpcode()) {
+    SDValue Op = ZExtPromotedInteger(N->getOperand(0));
     return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op);
-  return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op,
-                     N->getOperand(1), N->getOperand(2));
+  }
+
+  SDValue Mask = N->getOperand(1);
+  SDValue EVL = N->getOperand(2);
+  SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
+  return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, Mask,
+                     EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
@@ -1335,12 +1343,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FFREXP(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
   SDValue LHS = GetPromotedInteger(N->getOperand(0));
   SDValue RHS = N->getOperand(1);
-  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
-    RHS = ZExtPromotedInteger(RHS);
-  if (N->getOpcode() != ISD::VP_SHL)
+  if (N->getOpcode() != ISD::VP_SHL) {
+    if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+      RHS = ZExtPromotedInteger(RHS);
+
     return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+  }
+
+  SDValue Mask = N->getOperand(2);
+  SDValue EVL = N->getOperand(3);
+  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+    RHS = VPZExtPromotedInteger(RHS, Mask, EVL);
   return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
-                     N->getOperand(2), N->getOperand(3));
+                     Mask, EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
@@ -1364,27 +1379,39 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) {
-  // Sign extend the input.
-  SDValue LHS = SExtPromotedInteger(N->getOperand(0));
-  SDValue RHS = SExtPromotedInteger(N->getOperand(1));
-  if (N->getNumOperands() == 2)
+  if (N->getNumOperands() == 2) {
+    // Sign extend the input.
+    SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+    SDValue RHS = SExtPromotedInteger(N->getOperand(1));
     return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+  }
   assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
   assert(N->isVPOpcode() && "Expected VP opcode");
+  SDValue Mask = N->getOperand(2);
+  SDValue EVL = N->getOperand(3);
+  // Sign extend the input.
+  SDValue LHS = VPSExtPromotedInteger(N->getOperand(0), Mask, EVL);
+  SDValue RHS = VPSExtPromotedInteger(N->getOperand(1), Mask, EVL);
   return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
-                     N->getOperand(2), N->getOperand(3));
+                     Mask, EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) {
-  // Zero extend the input.
-  SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
-  SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
-  if (N->getNumOperands() == 2)
+  if (N->getNumOperands() == 2) {
+    // Zero extend the input.
+    SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+    SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
     return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+  }
   assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
   assert(N->isVPOpcode() && "Expected VP opcode");
+  // Zero extend the input.
+  SDValue Mask = N->getOperand(2);
+  SDValue EVL = N->getOperand(3);
+  SDValue LHS = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
+  SDValue RHS = VPZExtPromotedInteger(N->getOperand(1), Mask, EVL);
   return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
-                     N->getOperand(2), N->getOperand(3));
+                     Mask, EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) {
@@ -1400,27 +1427,43 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
-  // The input value must be properly sign extended.
-  SDValue LHS = SExtPromotedInteger(N->getOperand(0));
   SDValue RHS = N->getOperand(1);
-  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
-    RHS = ZExtPromotedInteger(RHS);
-  if (N->getOpcode() != ISD::VP_SRA)
+  if (N->getOpcode() != ISD::VP_SRA) {
+    // The input value must be properly sign extended.
+    SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+    if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+      RHS = ZExtPromotedInteger(RHS);
     return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+  }
+
+  SDValue Mask = N->getOperand(2);
+  SDValue EVL = N->getOperand(3);
+  // The input value must be properly sign extended.
+  SDValue LHS = VPSExtPromotedInteger(N->getOperand(0), Mask, EVL);
+  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+    RHS = VPZExtPromotedInteger(RHS, Mask, EVL);
   return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
-                     N->getOperand(2), N->getOperand(3));
+                     Mask, EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
-  // The input value must be properly zero extended.
-  SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
   SDValue RHS = N->getOperand(1);
-  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
-    RHS = ZExtPromotedInteger(RHS);
-  if (N->getOpcode() != ISD::VP_SRL)
+  if (N->getOpcode() != ISD::VP_SRL) {
+    // The input value must be properly zero extended.
+    SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+    if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+      RHS = ZExtPromotedInteger(RHS);
     return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+  }
+
+  SDValue Mask = N->getOperand(2);
+  SDValue EVL = N->getOperand(3);
+  // The input value must be properly zero extended.
+  SDValue LHS = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
+  if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+    RHS = VPZExtPromotedInteger(RHS, Mask, EVL);
   return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
-                     N->getOperand(2), N->getOperand(3));
+                     Mask, EVL);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) {
@@ -1487,7 +1530,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VPFunnelShift(SDNode *N) {
   SDValue Mask = N->getOperand(3);
   SDValue EVL = N->getOperand(4);
   if (getTypeAction(Amt.getValueType()) == TargetLowering::TypePromoteInteger)
-    Amt = ZExtPromotedInteger(Amt);
+    Amt = VPZExtPromotedInteger(Amt, Mask, EVL);
   EVT AmtVT = Amt.getValueType();
 
   SDLoc DL(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index d925089d5689f..ba3c7582d5a8a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -275,6 +275,27 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
     return DAG.getZeroExtendInReg(Op, dl, OldVT);
   }
 
+  /// Get a promoted operand and sign extend it to the final size.
+  SDValue VPSExtPromotedInteger(SDValue Op, SDValue Mask, SDValue EVL) {
+    EVT OldVT = Op.getValueType();
+    SDLoc dl(Op);
+    Op = GetPromotedInteger(Op);
+    // FIXME: Add VP_SIGN_EXTEND_INREG.
+    EVT VT = Op.getValueType();
+    unsigned BitsDiff = VT.getScalarSizeInBits() - OldVT.getScalarSizeInBits();
+    SDValue ShiftCst = DAG.getShiftAmountConstant(BitsDiff, VT, dl);
+    SDValue Shl = DAG.getNode(ISD::VP_SHL, dl, VT, Op, ShiftCst, Mask, EVL);
+    return DAG.getNode(ISD::VP_SRA, dl, VT, Shl, ShiftCst, Mask, EVL);
+  }
+
+  /// Get a promoted operand and zero extend it to the final size.
+  SDValue VPZExtPromotedInteger(SDValue Op, SDValue Mask, SDValue EVL) {
+    EVT OldVT = Op.getValueType();
+    SDLoc dl(Op);
+    Op = GetPromotedInteger(Op);
+    return DAG.getVPZeroExtendInReg(Op, Mask, EVL, dl, OldVT);
+  }
+
   // Promote the given operand V (vector or scalar) according to N's specific
   // reduction kind. N must be an integer VECREDUCE_* or VP_REDUCE_*. Returns
   // the nominal extension opcode (ISD::(ANY|ZERO|SIGN)_EXTEND) and the
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 40e621f0db220..361416edb554c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1174,8 +1174,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FADD: case ISD::VP_FADD:
   case ISD::FSUB: case ISD::VP_FSUB:
   case ISD::FMUL: case ISD::VP_FMUL:
-  case ISD::FMINNUM: case ISD::VP_FMINNUM:
-  case ISD::FMAXNUM: case ISD::VP_FMAXNUM:
+  case ISD::FMINNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::VP_FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::VP_FMAXNUM:
   case ISD::FMINIMUM:
   case ISD::VP_FMINIMUM:
   case ISD::FMAXIMUM:
@@ -4237,8 +4241,12 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SHL: case ISD::VP_SHL:
   case ISD::SRA: case ISD::VP_SRA:
   case ISD::SRL: case ISD::VP_SRL:
-  case ISD::FMINNUM: case ISD::VP_FMINNUM:
-  case ISD::FMAXNUM: case ISD::VP_FMAXNUM:
+  case ISD::FMINNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::VP_FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::VP_FMAXNUM:
   case ISD::FMINIMUM:
   case ISD::VP_FMINIMUM:
   case ISD::FMAXIMUM:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4e47f50ee4289..623b6343994a4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8428,6 +8428,7 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
   EVT VT = N->getValueType(0);
   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   bool IsMax = Opc == ISD::FMAXIMUM;
+  SDNodeFlags Flags = N->getFlags();
 
   if (VT.isVector() &&
       isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType()))
@@ -8444,15 +8445,15 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
   bool MinMaxMustRespectOrderedZero = false;
 
   if (isOperationLegalOrCustom(CompOpcIeee, VT)) {
-    MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS);
+    MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS, Flags);
     MinMaxMustRespectOrderedZero = true;
   } else if (isOperationLegalOrCustom(CompOpc, VT)) {
-    MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS);
+    MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags);
   } else {
     // NaN (if exists) will be propagated later, so orderness doesn't matter.
     SDValue Compare =
         DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
-    MinMax = DAG.getSelect(DL, VT, Compare, LHS, RHS);
+    MinMax = DAG.getSelect(DL, VT, Compare, LHS, RHS, Flags);
   }
 
   // Propagate any NaN of both operands
@@ -8461,7 +8462,7 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
     ConstantFP *FPNaN = ConstantFP::get(
         *DAG.getContext(), APFloat::getNaN(DAG.EVTToAPFloatSemantics(VT)));
     MinMax = DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETUO),
-                           DAG.getConstantFP(*FPNaN, DL, VT), MinMax);
+                           DAG.getConstantFP(*FPNaN, DL, VT), MinMax, Flags);
   }
 
   // fminimum/fmaximum requires -0.0 less than +0.0
@@ -8473,11 +8474,11 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
         DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
     SDValue LCmp = DAG.getSelect(
         DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
-        MinMax);
+        MinMax, Flags);
     SDValue RCmp = DAG.getSelect(
         DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS,
-        LCmp);
-    MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax);
+        LCmp, Flags);
+    MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags);
   }
 
   return MinMax;
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 3d5c58d282da5..df1c02c3dc67c 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -181,6 +181,7 @@ std::string EVT::getEVTString() const {
   case MVT::Metadata:  return "Metadata";
   case MVT::Untyped:   return "Untyped";
   case MVT::funcref:   return "funcref";
+  case MVT::exnref:    return "exnref";
   case MVT::externref: return "externref";
   case MVT::aarch64svcount:
     return "aarch64svcount";
diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp
index 1a9e1ba869c31..16c1dcb1e1175 100644
--- a/llvm/lib/CodeGen/WasmEHPrepare.cpp
+++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp
@@ -252,12 +252,11 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) {
       M.getOrInsertGlobal("__wasm_lpad_context", LPadContextTy));
   LPadContextGV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel);
 
-  LPadIndexField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 0,
-                                          "lpad_index_gep");
-  LSDAField =
-      IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 1, "lsda_gep");
-  SelectorField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 2,
-                                         "selector_gep");
+  LPadIndexField = LPadContextGV;
+  LSDAField = IRB.CreateConstInBoundsGEP2_32(LPadContextTy, LPadContextGV, 0, 1,
+                                             "lsda_gep");
+  SelectorField = IRB.CreateConstInBoundsGEP2_32(LPadContextTy, LPadContextGV,
+                                                 0, 2, "selector_gep");
 
   // wasm.landingpad.index() intrinsic, which is to specify landingpad index
   LPadIndexF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_landingpad_index);
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index eaf8c35142def..0046220611203 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -1129,7 +1129,8 @@ uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType,
 bool RuntimeDyldELF::resolveAArch64ShortBranch(
     unsigned SectionID, relocation_iterator RelI,
     const RelocationValueRef &Value) {
-  uint64_t Address;
+  uint64_t TargetOffset;
+  unsigned TargetSectionID;
   if (Value.SymbolName) {
     auto Loc = GlobalSymbolTable.find(Value.SymbolName);
 
@@ -1138,23 +1139,32 @@ bool RuntimeDyldELF::resolveAArch64ShortBranch(
       return false;
 
     const auto &SymInfo = Loc->second;
-    Address =
-        uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset(
-            SymInfo.getOffset()));
+
+    TargetSectionID = SymInfo.getSectionID();
+    TargetOffset = SymInfo.getOffset();
   } else {
-    Address = uint64_t(Sections[Value.SectionID].getLoadAddress());
+    TargetSectionID = Value.SectionID;
+    TargetOffset = 0;
   }
-  uint64_t Offset = RelI->getOffset();
-  uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset);
+
+  // We don't actually know the load addresses at this point, so if the
+  // branch is cross-section, we don't know exactly how far away it is.
+  if (TargetSectionID != SectionID)
+    return false;
+
+  uint64_t SourceOffset = RelI->getOffset();
 
   // R_AARCH64_CALL26 requires immediate to be in range -2^27 <= imm < 2^27
   // If distance between source and target is out of range then we should
   // create thunk.
-  if (!isInt<28>(Address + Value.Addend - SourceAddress))
+  if (!isInt<28>(TargetOffset + Value.Addend - SourceOffset))
     return false;
 
-  resolveRelocation(Sections[SectionID], Offset, Address, RelI->getType(),
-                    Value.Addend);
+  RelocationEntry RE(SectionID, SourceOffset, RelI->getType(), Value.Addend);
+  if (Value.SymbolName)
+    addRelocationForSymbol(RE, Value.SymbolName);
+  else
+    addRelocationForSection(RE, Value.SectionID);
 
   return true;
 }
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index ced5d78f994ab..8b1a21f962b08 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -1594,6 +1594,27 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     return;
   }
 
+  if (const ConstantPtrAuth *CPA = dyn_cast<ConstantPtrAuth>(CV)) {
+    Out << "ptrauth (";
+
+    // ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?)
+    unsigned NumOpsToWrite = 2;
+    if (!CPA->getOperand(2)->isNullValue())
+      NumOpsToWrite = 3;
+    if (!CPA->getOperand(3)->isNullValue())
+      NumOpsToWrite = 4;
+
+    ListSeparator LS;
+    for (unsigned i = 0, e = NumOpsToWrite; i != e; ++i) {
+      Out << LS;
+      WriterCtx.TypePrinter->print(CPA->getOperand(i)->getType(), Out);
+      Out << ' ';
+      WriteAsOperandInternal(Out, CPA->getOperand(i), WriterCtx);
+    }
+    Out << ')';
+    return;
+  }
+
   if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
     Type *ETy = CA->getType()->getElementType();
     Out << '[';
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index cfb89d557db47..119fcb4fa0346 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -550,6 +550,9 @@ void llvm::deleteConstant(Constant *C) {
   case Constant::NoCFIValueVal:
     delete static_cast<NoCFIValue *>(C);
     break;
+  case Constant::ConstantPtrAuthVal:
+    delete static_cast<ConstantPtrAuth *>(C);
+    break;
   case Constant::UndefValueVal:
     delete static_cast<UndefValue *>(C);
     break;
@@ -2015,6 +2018,124 @@ Value *NoCFIValue::handleOperandChangeImpl(Value *From, Value *To) {
   return nullptr;
 }
 
+//---- ConstantPtrAuth::get() implementations.
+//
+
+ConstantPtrAuth *ConstantPtrAuth::get(Constant *Ptr, ConstantInt *Key,
+                                      ConstantInt *Disc, Constant *AddrDisc) {
+  Constant *ArgVec[] = {Ptr, Key, Disc, AddrDisc};
+  ConstantPtrAuthKeyType MapKey(ArgVec);
+  LLVMContextImpl *pImpl = Ptr->getContext().pImpl;
+  return pImpl->ConstantPtrAuths.getOrCreate(Ptr->getType(), MapKey);
+}
+
+ConstantPtrAuth *ConstantPtrAuth::getWithSameSchema(Constant *Pointer) const {
+  return get(Pointer, getKey(), getDiscriminator(), getAddrDiscriminator());
+}
+
+ConstantPtrAuth::ConstantPtrAuth(Constant *Ptr, ConstantInt *Key,
+                                 ConstantInt *Disc, Constant *AddrDisc)
+    : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, &Op<0>(), 4) {
+  assert(Ptr->getType()->isPointerTy());
+  assert(Key->getBitWidth() == 32);
+  assert(Disc->getBitWidth() == 64);
+  assert(AddrDisc->getType()->isPointerTy());
+  setOperand(0, Ptr);
+  setOperand(1, Key);
+  setOperand(2, Disc);
+  setOperand(3, AddrDisc);
+}
+
+/// Remove the constant from the constant table.
+void ConstantPtrAuth::destroyConstantImpl() {
+  getType()->getContext().pImpl->ConstantPtrAuths.remove(this);
+}
+
+Value *ConstantPtrAuth::handleOperandChangeImpl(Value *From, Value *ToV) {
+  assert(isa<Constant>(ToV) && "Cannot make Constant refer to non-constant!");
+  Constant *To = cast<Constant>(ToV);
+
+  SmallVector<Constant *, 4> Values;
+  Values.reserve(getNumOperands());
+
+  unsigned NumUpdated = 0;
+
+  Use *OperandList = getOperandList();
+  unsigned OperandNo = 0;
+  for (Use *O = OperandList, *E = OperandList + getNumOperands(); O != E; ++O) {
+    Constant *Val = cast<Constant>(O->get());
+    if (Val == From) {
+      OperandNo = (O - OperandList);
+      Val = To;
+      ++NumUpdated;
+    }
+    Values.push_back(Val);
+  }
+
+  return getContext().pImpl->ConstantPtrAuths.replaceOperandsInPlace(
+      Values, this, From, To, NumUpdated, OperandNo);
+}
+
+bool ConstantPtrAuth::isKnownCompatibleWith(const Value *Key,
+                                            const Value *Discriminator,
+                                            const DataLayout &DL) const {
+  // If the keys are different, there's no chance for this to be compatible.
+  if (getKey() != Key)
+    return false;
+
+  // We can have 3 kinds of discriminators:
+  // - simple, integer-only:    `i64 x, ptr null` vs. `i64 x`
+  // - address-only:            `i64 0, ptr p` vs. `ptr p`
+  // - blended address/integer: `i64 x, ptr p` vs. `@llvm.ptrauth.blend(p, x)`
+
+  // If this constant has a simple discriminator (integer, no address), easy:
+  // it's compatible iff the provided full discriminator is also a simple
+  // discriminator, identical to our integer discriminator.
+  if (!hasAddressDiscriminator())
+    return getDiscriminator() == Discriminator;
+
+  // Otherwise, we can isolate address and integer discriminator components.
+  const Value *AddrDiscriminator = nullptr;
+
+  // This constant may or may not have an integer discriminator (instead of 0).
+  if (!getDiscriminator()->isNullValue()) {
+    // If it does, there's an implicit blend.  We need to have a matching blend
+    // intrinsic in the provided full discriminator.
+    if (!match(Discriminator,
+               m_Intrinsic<Intrinsic::ptrauth_blend>(
+                   m_Value(AddrDiscriminator), m_Specific(getDiscriminator()))))
+      return false;
+  } else {
+    // Otherwise, interpret the provided full discriminator as address-only.
+    AddrDiscriminator = Discriminator;
+  }
+
+  // Either way, we can now focus on comparing the address discriminators.
+
+  // Discriminators are i64, so the provided addr disc may be a ptrtoint.
+  if (auto *Cast = dyn_cast<PtrToIntOperator>(AddrDiscriminator))
+    AddrDiscriminator = Cast->getPointerOperand();
+
+  // Beyond that, we're only interested in compatible pointers.
+  if (getAddrDiscriminator()->getType() != AddrDiscriminator->getType())
+    return false;
+
+  // These are often the same constant GEP, making them trivially equivalent.
+  if (getAddrDiscriminator() == AddrDiscriminator)
+    return true;
+
+  // Finally, they may be equivalent base+offset expressions.
+  APInt Off1(DL.getIndexTypeSizeInBits(getAddrDiscriminator()->getType()), 0);
+  auto *Base1 = getAddrDiscriminator()->stripAndAccumulateConstantOffsets(
+      DL, Off1, /*AllowNonInbounds=*/true);
+
+  APInt Off2(DL.getIndexTypeSizeInBits(AddrDiscriminator->getType()), 0);
+  auto *Base2 = AddrDiscriminator->stripAndAccumulateConstantOffsets(
+      DL, Off2, /*AllowNonInbounds=*/true);
+
+  return Base1 == Base2 && Off1 == Off2;
+}
+
 //---- ConstantExpr::get() implementations.
 //
 
diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h
index 7067d0d121117..5153880b5cab6 100644
--- a/llvm/lib/IR/ConstantsContext.h
+++ b/llvm/lib/IR/ConstantsContext.h
@@ -23,6 +23,7 @@
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -286,6 +287,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CompareConstantExpr, Value)
 template <class ConstantClass> struct ConstantAggrKeyType;
 struct InlineAsmKeyType;
 struct ConstantExprKeyType;
+struct ConstantPtrAuthKeyType;
 
 template <class ConstantClass> struct ConstantInfo;
 template <> struct ConstantInfo<ConstantExpr> {
@@ -308,6 +310,10 @@ template <> struct ConstantInfo<ConstantVector> {
   using ValType = ConstantAggrKeyType<ConstantVector>;
   using TypeClass = VectorType;
 };
+template <> struct ConstantInfo<ConstantPtrAuth> {
+  using ValType = ConstantPtrAuthKeyType;
+  using TypeClass = Type;
+};
 
 template <class ConstantClass> struct ConstantAggrKeyType {
   ArrayRef<Constant *> Operands;
@@ -536,6 +542,47 @@ struct ConstantExprKeyType {
   }
 };
 
+struct ConstantPtrAuthKeyType {
+  ArrayRef<Constant *> Operands;
+
+  ConstantPtrAuthKeyType(ArrayRef<Constant *> Operands) : Operands(Operands) {}
+
+  ConstantPtrAuthKeyType(ArrayRef<Constant *> Operands, const ConstantPtrAuth *)
+      : Operands(Operands) {}
+
+  ConstantPtrAuthKeyType(const ConstantPtrAuth *C,
+                         SmallVectorImpl<Constant *> &Storage) {
+    assert(Storage.empty() && "Expected empty storage");
+    for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I)
+      Storage.push_back(cast<Constant>(C->getOperand(I)));
+    Operands = Storage;
+  }
+
+  bool operator==(const ConstantPtrAuthKeyType &X) const {
+    return Operands == X.Operands;
+  }
+
+  bool operator==(const ConstantPtrAuth *C) const {
+    if (Operands.size() != C->getNumOperands())
+      return false;
+    for (unsigned I = 0, E = Operands.size(); I != E; ++I)
+      if (Operands[I] != C->getOperand(I))
+        return false;
+    return true;
+  }
+
+  unsigned getHash() const {
+    return hash_combine_range(Operands.begin(), Operands.end());
+  }
+
+  using TypeClass = typename ConstantInfo<ConstantPtrAuth>::TypeClass;
+
+  ConstantPtrAuth *create(TypeClass *Ty) const {
+    return new ConstantPtrAuth(Operands[0], cast<ConstantInt>(Operands[1]),
+                               cast<ConstantInt>(Operands[2]), Operands[3]);
+  }
+};
+
 // Free memory for a given constant.  Assumes the constant has already been
 // removed from all relevant maps.
 void deleteConstant(Constant *C);
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index bd06ff82a15a5..13fa1afeaaff2 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -79,7 +79,7 @@ using ProfileCount = Function::ProfileCount;
 // are not in the public header file...
 template class llvm::SymbolTableListTraits<BasicBlock>;
 
-static cl::opt<unsigned> NonGlobalValueMaxNameSize(
+static cl::opt<int> NonGlobalValueMaxNameSize(
     "non-global-value-max-name-size", cl::Hidden, cl::init(1024),
     cl::desc("Maximum size for the name of non-global values."));
 
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 399fe0dad26c7..392e0d16f1761 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1562,6 +1562,8 @@ class LLVMContextImpl {
 
   DenseMap<const GlobalValue *, NoCFIValue *> NoCFIValues;
 
+  ConstantUniqueMap<ConstantPtrAuth> ConstantPtrAuths;
+
   ConstantUniqueMap<ConstantExpr> ExprConstants;
 
   ConstantUniqueMap<InlineAsm> InlineAsms;
diff --git a/llvm/lib/IR/ValueSymbolTable.cpp b/llvm/lib/IR/ValueSymbolTable.cpp
index 52f7ddcdc65a2..a020acf22a96c 100644
--- a/llvm/lib/IR/ValueSymbolTable.cpp
+++ b/llvm/lib/IR/ValueSymbolTable.cpp
@@ -43,23 +43,34 @@ ValueSymbolTable::~ValueSymbolTable() {
 ValueName *ValueSymbolTable::makeUniqueName(Value *V,
                                             SmallString<256> &UniqueName) {
   unsigned BaseSize = UniqueName.size();
+  bool AppenDot = false;
+  if (auto *GV = dyn_cast<GlobalValue>(V)) {
+    // A dot is appended to mark it as a clone during ABI demangling so that
+    // for example "_Z1fv" and "_Z1fv.1" both demangle to "f()", the second
+    // one being a clone.
+    // On NVPTX we cannot use a dot because PTX only allows [A-Za-z0-9_$] for
+    // identifiers. This breaks ABI demangling but at least ptxas accepts and
+    // compiles the program.
+    const Module *M = GV->getParent();
+    if (!(M && Triple(M->getTargetTriple()).isNVPTX()))
+      AppenDot = true;
+  }
+
   while (true) {
     // Trim any suffix off and append the next number.
     UniqueName.resize(BaseSize);
     raw_svector_ostream S(UniqueName);
-    if (auto *GV = dyn_cast<GlobalValue>(V)) {
-      // A dot is appended to mark it as clone during ABI demangling so that
-      // for example "_Z1fv" and "_Z1fv.1" both demangle to "f()", the second
-      // one being a clone.
-      // On NVPTX we cannot use a dot because PTX only allows [A-Za-z0-9_$] for
-      // identifiers. This breaks ABI demangling but at least ptxas accepts and
-      // compiles the program.
-      const Module *M = GV->getParent();
-      if (!(M && Triple(M->getTargetTriple()).isNVPTX()))
-        S << ".";
-    }
+    if (AppenDot)
+      S << ".";
     S << ++LastUnique;
 
+    // Retry with a shorter base name if MaxNameSize has been exceeded.
+    if (MaxNameSize > -1 && UniqueName.size() > (size_t)MaxNameSize) {
+      assert(BaseSize >= UniqueName.size() - (size_t)MaxNameSize &&
+             "Can't generate unique name: MaxNameSize is too small.");
+      BaseSize -= UniqueName.size() - (size_t)MaxNameSize;
+      continue;
+    }
     // Try insert the vmap entry with this suffix.
     auto IterBool = vmap.insert(std::make_pair(UniqueName.str(), V));
     if (IterBool.second)
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 50f8d6ec84201..684e54444621b 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -629,6 +629,7 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
 
   void visitConstantExprsRecursively(const Constant *EntryC);
   void visitConstantExpr(const ConstantExpr *CE);
+  void visitConstantPtrAuth(const ConstantPtrAuth *CPA);
   void verifyInlineAsmCall(const CallBase &Call);
   void verifyStatepoint(const CallBase &Call);
   void verifyFrameRecoverIndices();
@@ -2422,6 +2423,9 @@ void Verifier::visitConstantExprsRecursively(const Constant *EntryC) {
     if (const auto *CE = dyn_cast<ConstantExpr>(C))
       visitConstantExpr(CE);
 
+    if (const auto *CPA = dyn_cast<ConstantPtrAuth>(C))
+      visitConstantPtrAuth(CPA);
+
     if (const auto *GV = dyn_cast<GlobalValue>(C)) {
       // Global Values get visited separately, but we do need to make sure
       // that the global value is in the correct module
@@ -2449,6 +2453,23 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) {
           "Invalid bitcast", CE);
 }
 
+void Verifier::visitConstantPtrAuth(const ConstantPtrAuth *CPA) {
+  Check(CPA->getPointer()->getType()->isPointerTy(),
+        "signed ptrauth constant base pointer must have pointer type");
+
+  Check(CPA->getType() == CPA->getPointer()->getType(),
+        "signed ptrauth constant must have same type as its base pointer");
+
+  Check(CPA->getKey()->getBitWidth() == 32,
+        "signed ptrauth constant key must be i32 constant integer");
+
+  Check(CPA->getAddrDiscriminator()->getType()->isPointerTy(),
+        "signed ptrauth constant address discriminator must be a pointer");
+
+  Check(CPA->getDiscriminator()->getBitWidth() == 64,
+        "signed ptrauth constant discriminator must be i64 constant integer");
+}
+
 bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) {
   // There shouldn't be more attribute sets than there are parameters plus the
   // function and return value.
@@ -5090,6 +5111,8 @@ void Verifier::visitInstruction(Instruction &I) {
     } else if (isa<InlineAsm>(I.getOperand(i))) {
       Check(CBI && &CBI->getCalledOperandUse() == &I.getOperandUse(i),
             "Cannot take the address of an inline asm!", &I);
+    } else if (auto *CPA = dyn_cast<ConstantPtrAuth>(I.getOperand(i))) {
+      visitConstantExprsRecursively(CPA);
     } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(I.getOperand(i))) {
       if (CE->getType()->isPtrOrPtrVectorTy()) {
         // If we have a ConstantExpr pointer, we need to see if it came from an
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 2cddaf330b3bc..8014ef9d03948 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -2580,7 +2580,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro,
         OS << NumOfMacroInstantiations;
         Pos += 2;
       } else if (Argument == "+") {
-        OS << Macro.Count++;
+        OS << Macro.Count;
         Pos += 2;
       } else {
         for (; Index < NParameters; ++Index)
@@ -2629,6 +2629,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro,
     Body = Body.substr(Pos);
   }
 
+  ++Macro.Count;
   return false;
 }
 
diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp
index 6507a0e5950eb..23381955c60a8 100644
--- a/llvm/lib/Object/WasmObjectFile.cpp
+++ b/llvm/lib/Object/WasmObjectFile.cpp
@@ -177,8 +177,8 @@ static uint8_t readOpcode(WasmObjectFile::ReadContext &Ctx) {
 
 static wasm::ValType parseValType(WasmObjectFile::ReadContext &Ctx,
                                   uint32_t Code) {
-  // only directly encoded FUNCREF/EXTERNREF are supported
-  // (not ref null func or ref null extern)
+  // only directly encoded FUNCREF/EXTERNREF/EXNREF are supported
+  // (not ref null func, ref null extern, or ref null exn)
   switch (Code) {
   case wasm::WASM_TYPE_I32:
   case wasm::WASM_TYPE_I64:
@@ -187,6 +187,7 @@ static wasm::ValType parseValType(WasmObjectFile::ReadContext &Ctx,
   case wasm::WASM_TYPE_V128:
   case wasm::WASM_TYPE_FUNCREF:
   case wasm::WASM_TYPE_EXTERNREF:
+  case wasm::WASM_TYPE_EXNREF:
     return wasm::ValType(Code);
   }
   if (Code == wasm::WASM_TYPE_NULLABLE || Code == wasm::WASM_TYPE_NONNULLABLE) {
@@ -1288,6 +1289,7 @@ Error WasmObjectFile::parseImportSection(ReadContext &Ctx) {
       auto ElemType = Im.Table.ElemType;
       if (ElemType != wasm::ValType::FUNCREF &&
           ElemType != wasm::ValType::EXTERNREF &&
+          ElemType != wasm::ValType::EXNREF &&
           ElemType != wasm::ValType::OTHERREF)
         return make_error<GenericBinaryError>("invalid table element type",
                                               object_error::parse_failed);
@@ -1346,6 +1348,7 @@ Error WasmObjectFile::parseTableSection(ReadContext &Ctx) {
     auto ElemType = Tables.back().Type.ElemType;
     if (ElemType != wasm::ValType::FUNCREF &&
         ElemType != wasm::ValType::EXTERNREF &&
+        ElemType != wasm::ValType::EXNREF &&
         ElemType != wasm::ValType::OTHERREF) {
       return make_error<GenericBinaryError>("invalid table element type",
                                             object_error::parse_failed);
@@ -1680,6 +1683,7 @@ Error WasmObjectFile::parseElemSection(ReadContext &Ctx) {
         Segment.ElemKind = parseValType(Ctx, ElemKind);
         if (Segment.ElemKind != wasm::ValType::FUNCREF &&
             Segment.ElemKind != wasm::ValType::EXTERNREF &&
+            Segment.ElemKind != wasm::ValType::EXNREF &&
             Segment.ElemKind != wasm::ValType::OTHERREF) {
           return make_error<GenericBinaryError>("invalid elem type",
                                                 object_error::parse_failed);
diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp b/llvm/lib/ObjectYAML/WasmYAML.cpp
index 544a91d03dce0..7ad338f65706d 100644
--- a/llvm/lib/ObjectYAML/WasmYAML.cpp
+++ b/llvm/lib/ObjectYAML/WasmYAML.cpp
@@ -606,6 +606,7 @@ void ScalarEnumerationTraits<WasmYAML::ValueType>::enumeration(
   ECase(V128);
   ECase(FUNCREF);
   ECase(EXTERNREF);
+  ECase(EXNREF);
   ECase(OTHERREF);
 #undef ECase
 }
@@ -640,6 +641,7 @@ void ScalarEnumerationTraits<WasmYAML::TableType>::enumeration(
 #define ECase(X) IO.enumCase(Type, #X, CONCAT(X));
   ECase(FUNCREF);
   ECase(EXTERNREF);
+  ECase(EXNREF);
   ECase(OTHERREF);
 #undef ECase
 }
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 836206a4fd86e..798236c295194 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1212,7 +1212,8 @@ Error IndexedMemProfReader::deserialize(const unsigned char *Start,
   const uint64_t FirstWord =
       support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
 
-  if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2) {
+  if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2 ||
+      FirstWord == memprof::Version3) {
     // Everything is good.  We can proceed to deserialize the rest.
     Version = static_cast<memprof::IndexedVersion>(FirstWord);
   } else if (FirstWord >= 24) {
@@ -1559,6 +1560,7 @@ IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const {
            "MemProfCallStackTable must not be available");
     return getMemProfRecordV0(IndexedRecord, *MemProfFrameTable);
   case memprof::Version2:
+  case memprof::Version3:
     assert(MemProfFrameTable && "MemProfFrameTable must be available");
     assert(MemProfCallStackTable && "MemProfCallStackTable must be available");
     return getMemProfRecordV2(IndexedRecord, *MemProfFrameTable,
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index b67a9700b680a..b16714ae8b9a2 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -617,6 +617,56 @@ static Error writeMemProfV2(ProfOStream &OS,
   return Error::success();
 }
 
+// Write out MemProf Version3 as follows:
+// uint64_t Version
+// uint64_t RecordTableOffset = RecordTableGenerator.Emit
+// uint64_t FramePayloadOffset = Offset for the frame payload
+// uint64_t FrameTableOffset = FrameTableGenerator.Emit
+// uint64_t CallStackPayloadOffset = Offset for the call stack payload
+// uint64_t CallStackTableOffset = CallStackTableGenerator.Emit
+// uint64_t Num schema entries
+// uint64_t Schema entry 0
+// uint64_t Schema entry 1
+// ....
+// uint64_t Schema entry N - 1
+// OnDiskChainedHashTable MemProfRecordData
+// OnDiskChainedHashTable MemProfFrameData
+// OnDiskChainedHashTable MemProfCallStackData
+static Error writeMemProfV3(ProfOStream &OS,
+                            memprof::IndexedMemProfData &MemProfData,
+                            bool MemProfFullSchema) {
+  OS.write(memprof::Version3);
+  uint64_t HeaderUpdatePos = OS.tell();
+  OS.write(0ULL); // Reserve space for the memprof record table offset.
+  OS.write(0ULL); // Reserve space for the memprof frame payload offset.
+  OS.write(0ULL); // Reserve space for the memprof frame table offset.
+  OS.write(0ULL); // Reserve space for the memprof call stack payload offset.
+  OS.write(0ULL); // Reserve space for the memprof call stack table offset.
+
+  auto Schema = memprof::getHotColdSchema();
+  if (MemProfFullSchema)
+    Schema = memprof::getFullSchema();
+  writeMemProfSchema(OS, Schema);
+
+  uint64_t RecordTableOffset = writeMemProfRecords(OS, MemProfData.RecordData,
+                                                   &Schema, memprof::Version3);
+
+  uint64_t FramePayloadOffset = OS.tell();
+  uint64_t FrameTableOffset = writeMemProfFrames(OS, MemProfData.FrameData);
+
+  uint64_t CallStackPayloadOffset = OS.tell();
+  uint64_t CallStackTableOffset =
+      writeMemProfCallStacks(OS, MemProfData.CallStackData);
+
+  uint64_t Header[] = {
+      RecordTableOffset,      FramePayloadOffset,   FrameTableOffset,
+      CallStackPayloadOffset, CallStackTableOffset,
+  };
+  OS.patch({{HeaderUpdatePos, Header, std::size(Header)}});
+
+  return Error::success();
+}
+
 // Write out the MemProf data in a requested version.
 static Error writeMemProf(ProfOStream &OS,
                           memprof::IndexedMemProfData &MemProfData,
@@ -629,6 +679,8 @@ static Error writeMemProf(ProfOStream &OS,
     return writeMemProfV1(OS, MemProfData);
   case memprof::Version2:
     return writeMemProfV2(OS, MemProfData, MemProfFullSchema);
+  case memprof::Version3:
+    return writeMemProfV3(OS, MemProfData, MemProfFullSchema);
   }
 
   return make_error<InstrProfError>(
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index e5608644519db..2f0e53736c82e 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -52,6 +52,7 @@ size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema,
   case Version1:
     return serializedSizeV0(*this, Schema);
   case Version2:
+  case Version3:
     return serializedSizeV2(*this, Schema);
   }
   llvm_unreachable("unsupported MemProf version");
@@ -95,6 +96,7 @@ size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema,
   case Version1:
     return serializedSizeV0(*this, Schema);
   case Version2:
+  case Version3:
     return serializedSizeV2(*this, Schema);
   }
   llvm_unreachable("unsupported MemProf version");
@@ -149,6 +151,7 @@ void IndexedMemProfRecord::serialize(const MemProfSchema &Schema,
     serializeV0(*this, Schema, OS);
     return;
   case Version2:
+  case Version3:
     serializeV2(*this, Schema, OS);
     return;
   }
@@ -239,14 +242,15 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
   case Version1:
     return deserializeV0(Schema, Ptr);
   case Version2:
+  case Version3:
     return deserializeV2(Schema, Ptr);
   }
   llvm_unreachable("unsupported MemProf version");
 }
 
 MemProfRecord IndexedMemProfRecord::toMemProfRecord(
-    llvm::function_ref<const llvm::SmallVector<Frame>(const CallStackId)>
-        Callback) const {
+    llvm::function_ref<llvm::SmallVector<Frame>(const CallStackId)> Callback)
+    const {
   MemProfRecord Record;
 
   Record.AllocSites.reserve(AllocSites.size());
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 2a9b3903720be..283fcc153b33a 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -3665,6 +3665,15 @@ double IEEEFloat::convertToDouble() const {
   return api.bitsToDouble();
 }
 
+#ifdef HAS_IEE754_FLOAT128
+float128 IEEEFloat::convertToQuad() const {
+  assert(semantics == (const llvm::fltSemantics *)&semIEEEquad &&
+         "Float semantics are not IEEEquads");
+  APInt api = bitcastToAPInt();
+  return api.bitsToQuad();
+}
+#endif
+
 /// Integer bit is explicit in this format.  Intel hardware (387 and later)
 /// does not support these bit patterns:
 ///  exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity")
@@ -5260,6 +5269,21 @@ double APFloat::convertToDouble() const {
   return Temp.getIEEE().convertToDouble();
 }
 
+#ifdef HAS_IEE754_FLOAT128
+float128 APFloat::convertToQuad() const {
+  if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad)
+    return getIEEE().convertToQuad();
+  assert(getSemantics().isRepresentableBy(semIEEEquad) &&
+         "Float semantics is not representable by IEEEquad");
+  APFloat Temp = *this;
+  bool LosesInfo;
+  opStatus St = Temp.convert(semIEEEquad, rmNearestTiesToEven, &LosesInfo);
+  assert(!(St & opInexact) && !LosesInfo && "Unexpected imprecision");
+  (void)St;
+  return Temp.getIEEE().convertToQuad();
+}
+#endif
+
 float APFloat::convertToFloat() const {
   if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEsingle)
     return getIEEE().convertToFloat();
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1c7f6b870d390..3f717c8a60050 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -52,6 +52,19 @@ def ext_uaddv_to_uaddlv : GICombineRule<
   (apply [{ applyExtUaddvToUaddlv(*${root}, MRI, B, Observer, ${matchinfo}); }])
 >;
 
+class push_opcode_through_ext<Instruction opcode, Instruction extOpcode> : GICombineRule <
+  (defs root:$root),
+  (match (extOpcode $ext1, $src1):$ExtMI,
+         (extOpcode $ext2, $src2),
+         (opcode $dst, $ext1, $ext2):$root,
+         [{ return matchPushAddSubExt(*${root}, MRI, ${dst}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]),
+  (apply [{ applyPushAddSubExt(*${root}, MRI, B, ${ExtMI}->getOpcode() == TargetOpcode::G_SEXT, ${dst}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }])>;
+
+def push_sub_through_zext : push_opcode_through_ext<G_SUB, G_ZEXT>;
+def push_add_through_zext : push_opcode_through_ext<G_ADD, G_ZEXT>;
+def push_sub_through_sext : push_opcode_through_ext<G_SUB, G_SEXT>;
+def push_add_through_sext : push_opcode_through_ext<G_ADD, G_SEXT>;
+
 def AArch64PreLegalizerCombiner: GICombiner<
   "AArch64PreLegalizerCombinerImpl", [all_combines,
                                       fconstant_to_constant,
@@ -59,7 +72,11 @@ def AArch64PreLegalizerCombiner: GICombiner<
                                       fold_global_offset,
                                       shuffle_to_extract,
                                       ext_addv_to_udot_addv,
-                                      ext_uaddv_to_uaddlv]> {
+                                      ext_uaddv_to_uaddlv,
+                                      push_sub_through_zext,
+                                      push_add_through_zext,
+                                      push_sub_through_sext,
+                                      push_add_through_sext]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
@@ -265,6 +282,14 @@ def or_to_bsp: GICombineRule <
   (apply [{ applyOrToBSP(*${root}, MRI, B, ${matchinfo}); }])
 >;
 
+// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
+def combine_mul_cmlt : GICombineRule<
+  (defs root:$root, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_MUL):$root,
+        [{ return matchCombineMulCMLT(*${root}, MRI, ${matchinfo}); }]),
+  (apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
+>;
+
 // Post-legalization combines which should happen at all optimization levels.
 // (E.g. ones that facilitate matching for the selector) For example, matching
 // pseudos.
@@ -296,5 +321,6 @@ def AArch64PostLegalizerCombiner
                         split_store_zero_128, undef_combines,
                         select_to_minmax, or_to_bsp, combine_concat_vector,
                         commute_constant_to_rhs,
-                        push_freeze_to_prevent_poison_from_propagating]> {
+                        push_freeze_to_prevent_poison_from_propagating,
+                        combine_mul_cmlt]> {
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 25f2e4d7c4de6..8fd58f4698d28 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5717,6 +5717,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::aarch64_sve_ucvtf_x4:
       SelectCVTIntrinsic(Node, 4, AArch64::UCVTF_4Z4Z_StoS);
       return;
+    case Intrinsic::aarch64_sve_fcvt_widen_x2:
+      SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVT_2ZZ_H_S);
+      return;
+    case Intrinsic::aarch64_sve_fcvtl_widen_x2:
+      SelectUnaryMultiIntrinsic(Node, 2, false, AArch64::FCVTL_2ZZ_H_S);
+      return;
     case Intrinsic::aarch64_sve_sclamp_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
@@ -5738,6 +5744,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
                AArch64::FCLAMP_VG2_2Z2Z_D}))
         SelectClamp(Node, 2, Op);
       return;
+    case Intrinsic::aarch64_sve_bfclamp_single_x2:
+      SelectClamp(Node, 2, AArch64::BFCLAMP_VG2_2ZZZ_H);
+      return;
     case Intrinsic::aarch64_sve_sclamp_single_x4:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
@@ -5759,6 +5768,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
                AArch64::FCLAMP_VG4_4Z4Z_D}))
         SelectClamp(Node, 4, Op);
       return;
+    case Intrinsic::aarch64_sve_bfclamp_single_x4:
+      SelectClamp(Node, 4, AArch64::BFCLAMP_VG4_4ZZZ_H);
+      return;
     case Intrinsic::aarch64_sve_add_single_x2:
       if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
               Node->getValueType(0),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 25ba8d8500306..3e2a5bfbc2321 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -360,24 +360,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   if (Subtarget->hasNEON()) {
     addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
     addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
-    // Someone set us up the NEON.
-    addDRTypeForNEON(MVT::v2f32);
-    addDRTypeForNEON(MVT::v8i8);
-    addDRTypeForNEON(MVT::v4i16);
-    addDRTypeForNEON(MVT::v2i32);
-    addDRTypeForNEON(MVT::v1i64);
-    addDRTypeForNEON(MVT::v1f64);
-    addDRTypeForNEON(MVT::v4f16);
-    addDRTypeForNEON(MVT::v4bf16);
-
-    addQRTypeForNEON(MVT::v4f32);
-    addQRTypeForNEON(MVT::v2f64);
-    addQRTypeForNEON(MVT::v16i8);
-    addQRTypeForNEON(MVT::v8i16);
-    addQRTypeForNEON(MVT::v4i32);
-    addQRTypeForNEON(MVT::v2i64);
-    addQRTypeForNEON(MVT::v8f16);
-    addQRTypeForNEON(MVT::v8bf16);
+
+    addDRType(MVT::v2f32);
+    addDRType(MVT::v8i8);
+    addDRType(MVT::v4i16);
+    addDRType(MVT::v2i32);
+    addDRType(MVT::v1i64);
+    addDRType(MVT::v1f64);
+    addDRType(MVT::v4f16);
+    addDRType(MVT::v4bf16);
+
+    addQRType(MVT::v4f32);
+    addQRType(MVT::v2f64);
+    addQRType(MVT::v16i8);
+    addQRType(MVT::v8i16);
+    addQRType(MVT::v4i32);
+    addQRType(MVT::v2i64);
+    addQRType(MVT::v8f16);
+    addQRType(MVT::v8bf16);
   }
 
   if (Subtarget->hasSVEorSME()) {
@@ -1125,7 +1125,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 
-  if (Subtarget->hasNEON()) {
+  if (Subtarget->isNeonAvailable()) {
     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
     // silliness like this:
     for (auto Op :
@@ -1337,6 +1337,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     // FADDP custom lowering
     for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
       setOperationAction(ISD::FADD, VT, Custom);
+  } else /* !isNeonAvailable */ {
+    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
+      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+        setOperationAction(Op, VT, Expand);
+
+      if (VT.is128BitVector() || VT.is64BitVector()) {
+        setOperationAction(ISD::LOAD, VT, Legal);
+        setOperationAction(ISD::STORE, VT, Legal);
+        setOperationAction(ISD::BITCAST, VT,
+                           Subtarget->isLittleEndian() ? Legal : Expand);
+      }
+      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
+        setTruncStoreAction(VT, InnerVT, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+      }
+    }
   }
 
   if (Subtarget->hasSME()) {
@@ -2020,14 +2038,16 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::ZERO_EXTEND, VT, Default);
 }
 
-void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
+void AArch64TargetLowering::addDRType(MVT VT) {
   addRegisterClass(VT, &AArch64::FPR64RegClass);
-  addTypeForNEON(VT);
+  if (Subtarget->isNeonAvailable())
+    addTypeForNEON(VT);
 }
 
-void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
+void AArch64TargetLowering::addQRType(MVT VT) {
   addRegisterClass(VT, &AArch64::FPR128RegClass);
-  addTypeForNEON(VT);
+  if (Subtarget->isNeonAvailable())
+    addTypeForNEON(VT);
 }
 
 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
@@ -9445,7 +9465,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                               SelectionDAG &DAG) const {
-  if (!Subtarget->hasNEON())
+  if (!Subtarget->isNeonAvailable() &&
+      !Subtarget->useSVEForFixedLengthVectors())
     return SDValue();
 
   EVT VT = Op.getValueType();
@@ -14141,6 +14162,13 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
 }
 
+bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
+    EVT VT, unsigned DefinedValues) const {
+  if (!Subtarget->isNeonAvailable())
+    return false;
+  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
+}
+
 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   // Currently no fixed length shuffles that require SVE are legal.
   if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
@@ -14337,7 +14365,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
       unsigned Opc =
           (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
       return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
-                         DAG.getConstant(Cnt, DL, MVT::i32));
+                         DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
     }
 
     // Right shift register.  Note, there is not a shift right register
@@ -19838,7 +19866,8 @@ performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
 // help, for example, to produce ssra from sshr+add.
 static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
-  if (VT != MVT::i64)
+  if (VT != MVT::i64 ||
+      DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
     return SDValue();
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index a44a3d35d2f9c..73bc9ad53bb8a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1017,8 +1017,10 @@ class AArch64TargetLowering : public TargetLowering {
 
   void addTypeForNEON(MVT VT);
   void addTypeForFixedLengthSVE(MVT VT);
-  void addDRTypeForNEON(MVT VT);
-  void addQRTypeForNEON(MVT VT);
+  void addDRType(MVT VT);
+  void addQRType(MVT VT);
+
+  bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override;
 
   unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
                                   SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 4830033b23527..dd54520c8ddad 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -733,6 +733,12 @@ def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
 def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;
 
 def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
+
+def AArch64vashr_exact : PatFrag<(ops          node:$lhs, node:$rhs),
+                                 (AArch64vashr node:$lhs, node:$rhs), [{
+  return N->getFlags().hasExact();
+}]>;
+
 def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>;
 def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>;
 def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>;
@@ -7710,6 +7716,25 @@ defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
 defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", AArch64rshrn>;
 defm SHL     : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
 
+let Predicates = [HasNEON] in {
+def : Pat<(v2f32 (sint_to_fp (v2i32 (AArch64vashr_exact v2i32:$Vn, i32:$shift)))),
+          (SCVTFv2i32_shift $Vn, vecshiftR32:$shift)>;
+
+def : Pat<(v4f32 (sint_to_fp (v4i32 (AArch64vashr_exact v4i32:$Vn, i32:$shift)))),
+          (SCVTFv4i32_shift $Vn, vecshiftR32:$shift)>;
+
+def : Pat<(v2f64 (sint_to_fp (v2i64 (AArch64vashr_exact v2i64:$Vn, i32:$shift)))),
+          (SCVTFv2i64_shift $Vn, vecshiftR64:$shift)>;
+}
+
+let Predicates = [HasNEON, HasFullFP16] in {
+def : Pat<(v4f16 (sint_to_fp (v4i16 (AArch64vashr_exact v4i16:$Vn, i32:$shift)))),
+          (SCVTFv4i16_shift $Vn, vecshiftR16:$shift)>;
+
+def : Pat<(v8f16 (sint_to_fp (v8i16 (AArch64vashr_exact v8i16:$Vn, i32:$shift)))),
+          (SCVTFv8i16_shift $Vn, vecshiftR16:$shift)>;
+}
+
 // X << 1 ==> X + X
 class SHLToADDPat<ValueType ty, RegisterClass regtype>
   : Pat<(ty (AArch64vshl (ty regtype:$Rn), (i32 1))),
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 3b3c1fc8b27bf..4a7c82b393c10 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -227,6 +227,8 @@ class AArch64InstructionSelector : public InstructionSelector {
   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
+  void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
+                   unsigned Opc1, unsigned Opc2, bool isExt);
 
   bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -6537,6 +6539,25 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
     I.eraseFromParent();
     return true;
   }
+  case Intrinsic::aarch64_neon_tbl2:
+    SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false);
+    return true;
+  case Intrinsic::aarch64_neon_tbl3:
+    SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three,
+                false);
+    return true;
+  case Intrinsic::aarch64_neon_tbl4:
+    SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false);
+    return true;
+  case Intrinsic::aarch64_neon_tbx2:
+    SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, AArch64::TBXv16i8Two, true);
+    return true;
+  case Intrinsic::aarch64_neon_tbx3:
+    SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true);
+    return true;
+  case Intrinsic::aarch64_neon_tbx4:
+    SelectTable(I, MRI, 4, AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true);
+    return true;
   case Intrinsic::swift_async_context_addr:
     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
                               {Register(AArch64::FP)})
@@ -6552,6 +6573,30 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
   return false;
 }
 
+void AArch64InstructionSelector::SelectTable(MachineInstr &I,
+                                             MachineRegisterInfo &MRI,
+                                             unsigned NumVec, unsigned Opc1,
+                                             unsigned Opc2, bool isExt) {
+  Register DstReg = I.getOperand(0).getReg();
+  unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? Opc1 : Opc2;
+
+  // Create the REG_SEQUENCE
+  SmallVector<Register, 4> Regs;
+  for (unsigned i = 0; i < NumVec; i++)
+    Regs.push_back(I.getOperand(i + 2 + isExt).getReg());
+  Register RegSeq = createQTuple(Regs, MIB);
+
+  Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg();
+  MachineInstrBuilder Instr;
+  if (isExt) {
+    Register Reg = I.getOperand(2).getReg();
+    Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg});
+  } else
+    Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg});
+  constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI);
+  I.eraseFromParent();
+}
+
 InstructionSelector::ComplexRendererFns
 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
   auto MaybeImmed = getImmedFromMO(Root);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index d8ca5494ba50a..7f3e0e01ccd25 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -381,6 +381,61 @@ void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
   MI.eraseFromParent();
 }
 
+// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
+bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
+                         Register &SrcReg) {
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+  if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) &&
+      DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) &&
+      DstTy != LLT::fixed_vector(8, 16))
+    return false;
+
+  auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+  if (AndMI->getOpcode() != TargetOpcode::G_AND)
+    return false;
+  auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI);
+  if (LShrMI->getOpcode() != TargetOpcode::G_LSHR)
+    return false;
+
+  // Check the constant splat values
+  auto V1 = isConstantOrConstantSplatVector(
+      *MRI.getVRegDef(MI.getOperand(2).getReg()), MRI);
+  auto V2 = isConstantOrConstantSplatVector(
+      *MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI);
+  auto V3 = isConstantOrConstantSplatVector(
+      *MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI);
+  if (!V1.has_value() || !V2.has_value() || !V3.has_value())
+    return false;
+  unsigned HalfSize = DstTy.getScalarSizeInBits() / 2;
+  if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) ||
+      V3 != (HalfSize - 1))
+    return false;
+
+  SrcReg = LShrMI->getOperand(1).getReg();
+
+  return true;
+}
+
+// Replaces MI with bitcast(icmp slt (bitcast SrcReg to half-width lanes), 0).
+void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
+                         MachineIRBuilder &B, Register &SrcReg) {
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+  LLT HalfTy =
+      DstTy.changeElementCount(DstTy.getElementCount().multiplyCoefficientBy(2))
+          .changeElementSize(DstTy.getScalarSizeInBits() / 2);
+
+  Register ZeroVec = B.buildConstant(HalfTy, 0).getReg(0);
+  Register CastReg =
+      B.buildInstr(TargetOpcode::G_BITCAST, {HalfTy}, {SrcReg}).getReg(0);
+  Register CMLTReg =
+      B.buildICmp(CmpInst::ICMP_SLT, HalfTy, CastReg, ZeroVec).getReg(0);
+
+  B.buildInstr(TargetOpcode::G_BITCAST, {DstReg}, {CMLTReg});
+  MI.eraseFromParent();
+}
+
 class AArch64PostLegalizerCombinerImpl : public Combiner {
 protected:
   // TODO: Make CombinerHelper methods const.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index a82d3cd095659..0f89fa557cd57 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -554,6 +554,57 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
   MI.eraseFromParent();
 }
 
+// Pushes ADD/SUB through extend instructions to decrease the number of extend
+// instructions at the end by allowing selection of {s|u}addl sooner.
+
+// i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8))
+bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
+                        Register DstReg, Register SrcReg1, Register SrcReg2) {
+  assert((MI.getOpcode() == TargetOpcode::G_ADD ||
+          MI.getOpcode() == TargetOpcode::G_SUB) &&
+         "Expected a G_ADD or G_SUB instruction\n");
+
+  // Deal with vector types only
+  LLT DstTy = MRI.getType(DstReg);
+  if (!DstTy.isVector())
+    return false;
+
+  // Match only when the extends widen each element by more than 2x
+  // (i8 -> i32, or i8/i16 -> i64) and both extend sources have the same type.
+  Register ExtDstReg = MI.getOperand(1).getReg();
+  LLT Ext1SrcTy = MRI.getType(SrcReg1);
+  LLT Ext2SrcTy = MRI.getType(SrcReg2);
+  unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits();
+  unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits();
+  if (((Ext1SrcScal == 8 && ExtDstScal == 32) ||
+       ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) &&
+      Ext1SrcTy == Ext2SrcTy)
+    return true;
+  return false;
+}
+
+void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI,
+                        MachineIRBuilder &B, bool isSExt, Register DstReg,
+                        Register SrcReg1, Register SrcReg2) {
+  // Redo the extends and the arithmetic at twice the source element width.
+  LLT SrcTy = MRI.getType(SrcReg1);
+  LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2);
+  unsigned ExtOpc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
+  auto Ext1 = B.buildInstr(ExtOpc, {MidTy}, {SrcReg1});
+  auto Ext2 = B.buildInstr(ExtOpc, {MidTy}, {SrcReg2});
+  auto Mid = B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1, Ext2});
+
+  // A G_SUB result must always be sign-extended. A G_ADD result must be
+  // sign-extended for sext inputs and may be sign- or zero-extended for zext
+  // inputs, so G_ADD simply reuses the original extend opcode.
+  if (MI.getOpcode() == TargetOpcode::G_ADD)
+    B.buildInstr(ExtOpc, {DstReg}, {Mid});
+  else
+    B.buildSExt(DstReg, Mid);
+
+  MI.eraseFromParent();
+}
+
 bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                         CombinerHelper &Helper, GISelChangeObserver &Observer) {
   // Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 50ee37b0dfebc..b21b1faf5c962 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -104,6 +104,13 @@ class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, Re
   let usesCustomInserter = 1;
 }
 
+class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum za_flag>
+    : SMEPseudo2Instr<name, 0>,
+      Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, index_ty:$imm), []> {
+  let SMEMatrixType = za_flag;
+  let usesCustomInserter = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // SME pattern match helpers.
 //===----------------------------------------------------------------------===//
@@ -189,6 +196,9 @@ class SME2_Tile_VG4_Multi_Pat<string name, SDPatternOperator intrinsic, Operand
     : Pat<(intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
           (!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>;
 
+class SME2_Zero_Matrix_Pat<string name, SDPatternOperator intrinsic, Operand offset_ty, ComplexPattern tileslice>
+    : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset))),
+          (!cast<Instruction>(name) $base, $offset)>;
 //===----------------------------------------------------------------------===//
 // SME pattern match helpers.
 //===----------------------------------------------------------------------===//
@@ -4815,39 +4825,57 @@ class sme2p1_zero_matrix<bits<6> opc, Operand index_ty, string mnemonic,
 }
 
 multiclass sme2p1_zero_matrix<string mnemonic> {
-  def _VG2_Z : sme2p1_zero_matrix<{0b000,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx2"> {
+  def _VG2_Z : sme2p1_zero_matrix<{0b000,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx2">, SMEPseudo2Instr<NAME # _VG2_Z , 1> {
     bits<3> imm;
     let Inst{2-0} = imm;
   }
-  def _2Z : sme2p1_zero_matrix<{0b001,?,?,?}, uimm3s2range, mnemonic> {
+  def _2Z : sme2p1_zero_matrix<{0b001,?,?,?}, uimm3s2range, mnemonic>, SMEPseudo2Instr<NAME # _2Z, 1> {
     bits<3> imm;
     let Inst{2-0} = imm;
   }
-  def _VG2_2Z : sme2p1_zero_matrix<{0b0100,?,?}, uimm2s2range, mnemonic, "vgx2"> {
+  def _VG2_2Z : sme2p1_zero_matrix<{0b0100,?,?}, uimm2s2range, mnemonic, "vgx2">, SMEPseudo2Instr<NAME # _VG2_2Z, 1> {
     bits<2> imm;
     let Inst{1-0} = imm;
   }
-  def _VG4_2Z : sme2p1_zero_matrix<{0b0110,?,?}, uimm2s2range, mnemonic, "vgx4"> {
+  def _VG4_2Z : sme2p1_zero_matrix<{0b0110,?,?}, uimm2s2range, mnemonic, "vgx4">, SMEPseudo2Instr<NAME # _VG4_2Z, 1> {
     bits<2> imm;
     let Inst{1-0} = imm;
   }
-  def _VG4_Z : sme2p1_zero_matrix<{0b100,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx4"> {
+  def _VG4_Z : sme2p1_zero_matrix<{0b100,?,?,?}, sme_elm_idx0_7, mnemonic, "vgx4">, SMEPseudo2Instr<NAME # _VG4_Z, 1> {
     bits<3> imm;
     let Inst{2-0} = imm;
   }
-  def _4Z : sme2p1_zero_matrix<{0b1010,?,?}, uimm2s4range, mnemonic> {
+  def _4Z : sme2p1_zero_matrix<{0b1010,?,?}, uimm2s4range, mnemonic>, SMEPseudo2Instr<NAME # _4Z, 1> {
     bits<2> imm;
     let Inst{1-0} = imm;
   }
-  def _VG2_4Z :sme2p1_zero_matrix<{0b11000,?}, uimm1s4range, mnemonic, "vgx2"> {
+  def _VG2_4Z : sme2p1_zero_matrix<{0b11000,?}, uimm1s4range, mnemonic, "vgx2">, SMEPseudo2Instr<NAME # _VG2_4Z, 1> {
     bits<1> imm;
     let Inst{0}   = imm;
   }
-  def _VG4_4Z :sme2p1_zero_matrix<{0b11100,?}, uimm1s4range, mnemonic, "vgx4"> {
+  def _VG4_4Z : sme2p1_zero_matrix<{0b11100,?}, uimm1s4range, mnemonic, "vgx4">, SMEPseudo2Instr<NAME # _VG4_4Z, 1> {
     bits<1> imm;
     let Inst{0}   = imm;
   }
-}
+  // A _PSEUDO def for each variant above, named after the real instruction.
+  def NAME # _VG2_Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG2_Z, sme_elm_idx0_7, SMEMatrixArray>;
+  def NAME # _VG4_Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG4_Z, sme_elm_idx0_7, SMEMatrixArray>;
+  def NAME # _2Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _2Z, uimm2s2range, SMEMatrixArray>;
+  def NAME # _VG2_2Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG2_2Z, uimm1s2range, SMEMatrixArray>;
+  def NAME # _VG4_2Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG4_2Z, uimm1s2range, SMEMatrixArray>;
+  def NAME # _4Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _4Z, uimm1s4range, SMEMatrixArray>;
+  def NAME # _VG2_4Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG2_4Z, uimm0s4range, SMEMatrixArray>;
+  def NAME # _VG4_4Z_PSEUDO : sem2p1_zero_matrix_pseudo<NAME # _VG4_4Z, uimm0s4range, SMEMatrixArray>;
+  // Patterns that select the zero_za64 intrinsics to the pseudos above.
+  def : SME2_Zero_Matrix_Pat<NAME # _VG2_Z_PSEUDO, int_aarch64_sme_zero_za64_vg1x2, sme_elm_idx0_7, tileslice16>;
+  def : SME2_Zero_Matrix_Pat<NAME # _VG4_Z_PSEUDO, int_aarch64_sme_zero_za64_vg1x4, sme_elm_idx0_7, tileslice16>;
+  def : SME2_Zero_Matrix_Pat<NAME # _2Z_PSEUDO, int_aarch64_sme_zero_za64_vg2x1, uimm2s2range, tileslicerange2s2>;
+  def : SME2_Zero_Matrix_Pat<NAME # _VG2_2Z_PSEUDO, int_aarch64_sme_zero_za64_vg2x2, uimm1s2range, tileslicerange1s2>;
+  def : SME2_Zero_Matrix_Pat<NAME # _VG4_2Z_PSEUDO, int_aarch64_sme_zero_za64_vg2x4, uimm1s2range, tileslicerange1s2>;
+  def : SME2_Zero_Matrix_Pat<NAME # _4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x1, uimm1s4range, tileslicerange1s4>;
+  def : SME2_Zero_Matrix_Pat<NAME # _VG2_4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x2, uimm0s4range, tileslicerange0s4>;
+  def : SME2_Zero_Matrix_Pat<NAME # _VG4_4Z_PSEUDO, int_aarch64_sme_zero_za64_vg4x4, uimm0s4range, tileslicerange0s4>;
+}
 
 //===----------------------------------------------------------------------===//
 // SME2.1 lookup table expand two non-contiguous registers
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 351263d079768..24f9a6e375baa 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -494,7 +494,7 @@ class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode,
 class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode,
                              RegisterClass DataRC, int num_addrs,
                              string dns="">
-  : VIMAGE_gfx12<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> {
+  : VIMAGE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns> {
   let InOperandList = !con(AddrIns,
                            (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim,
                                 CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe),
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index eae666ab0e7d7..97a8ff4486609 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -579,6 +579,7 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
         (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
+      MRI.reserveReg(Reg, TRI);
       return Reg;
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ddb5f71935685..4b5f9bdd82b8d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2083,6 +2083,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
   assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
 
+  assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
+         "unreserved scratch RSRC register");
+
   MachineOperand &FIOp = MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index ea8109bbee9ae..09dc1c781e2f3 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -278,11 +278,10 @@ LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
 
     for (const MachineInstr &MI : *BII.first) {
       auto III = Instructions.find(&MI);
-      if (III == Instructions.end())
-        continue;
-
-      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
-             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+      if (III != Instructions.end()) {
+        dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
+               << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+      }
     }
   }
 }
@@ -455,10 +454,8 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
       LiveRange &LR = LIS->getRegUnit(Unit);
       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
-      if (!Value)
-        continue;
-
-      markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
+      if (Value)
+        markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
     }
   }
 }
@@ -499,19 +496,16 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
 
       if (TII->isWQM(Opcode)) {
         // If LOD is not supported WQM is not needed.
-        if (!ST->hasExtendedImageInsts())
-          continue;
         // Only generate implicit WQM if implicit derivatives are required.
         // This avoids inserting unintended WQM if a shader type without
         // implicit derivatives uses an image sampling instruction.
-        if (!HasImplicitDerivatives)
-          continue;
-        // Sampling instructions don't need to produce results for all pixels
-        // in a quad, they just require all inputs of a quad to have been
-        // computed for derivatives.
-        markInstructionUses(MI, StateWQM, Worklist);
-        GlobalFlags |= StateWQM;
-        continue;
+        if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
+          // Sampling instructions don't need to produce results for all pixels
+          // in a quad, they just require all inputs of a quad to have been
+          // computed for derivatives.
+          markInstructionUses(MI, StateWQM, Worklist);
+          GlobalFlags |= StateWQM;
+        }
       } else if (Opcode == AMDGPU::WQM) {
         // The WQM intrinsic requires its output to have all the helper lanes
         // correct, so we need it to be in WQM.
@@ -520,7 +514,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       } else if (Opcode == AMDGPU::SOFT_WQM) {
         LowerToCopyInstrs.push_back(&MI);
         SoftWQMInstrs.push_back(&MI);
-        continue;
       } else if (Opcode == AMDGPU::STRICT_WWM) {
         // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
         // it needs to be executed in WQM or Exact so that its copy doesn't
@@ -528,7 +521,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         markInstructionUses(MI, StateStrictWWM, Worklist);
         GlobalFlags |= StateStrictWWM;
         LowerToMovInstrs.push_back(&MI);
-        continue;
       } else if (Opcode == AMDGPU::STRICT_WQM ||
                  TII->isDualSourceBlendEXP(MI)) {
         // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
@@ -551,7 +543,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           GlobalFlags |= StateExact;
           III.Disabled = StateWQM | StateStrict;
         }
-        continue;
       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                  Opcode == AMDGPU::DS_PARAM_LOAD ||
                  Opcode == AMDGPU::LDS_DIRECT_LOAD ||
@@ -561,7 +552,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         InstrInfo &II = Instructions[&MI];
         II.Needs |= StateStrictWQM;
         GlobalFlags |= StateStrictWQM;
-        continue;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
         III.Disabled = StateStrict;
@@ -574,7 +564,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           }
         }
         SetInactiveInstrs.push_back(&MI);
-        continue;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -583,40 +572,33 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         }
         GlobalFlags |= StateExact;
         III.Disabled = StateWQM | StateStrict;
-        continue;
-      } else {
-        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
-          LiveMaskQueries.push_back(&MI);
-        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
-                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
-                   Opcode == AMDGPU::SI_DEMOTE_I1) {
-          KillInstrs.push_back(&MI);
-          BBI.NeedsLowering = true;
-        } else if (WQMOutputs) {
-          // The function is in machine SSA form, which means that physical
-          // VGPRs correspond to shader inputs and outputs. Inputs are
-          // only used, outputs are only defined.
-          // FIXME: is this still valid?
-          for (const MachineOperand &MO : MI.defs()) {
-            if (!MO.isReg())
-              continue;
-
-            Register Reg = MO.getReg();
-
-            if (!Reg.isVirtual() &&
-                TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
-              Flags = StateWQM;
-              break;
-            }
+      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
+                 Opcode == AMDGPU::SI_LIVE_MASK) {
+        LiveMaskQueries.push_back(&MI);
+      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
+                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
+                 Opcode == AMDGPU::SI_DEMOTE_I1) {
+        KillInstrs.push_back(&MI);
+        BBI.NeedsLowering = true;
+      } else if (WQMOutputs) {
+        // The function is in machine SSA form, which means that physical
+        // VGPRs correspond to shader inputs and outputs. Inputs are
+        // only used, outputs are only defined.
+        // FIXME: is this still valid?
+        for (const MachineOperand &MO : MI.defs()) {
+          Register Reg = MO.getReg();
+          if (Reg.isPhysical() &&
+              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
+            Flags = StateWQM;
+            break;
           }
         }
-
-        if (!Flags)
-          continue;
       }
 
-      markInstruction(MI, Flags, Worklist);
-      GlobalFlags |= Flags;
+      if (Flags) {
+        markInstruction(MI, Flags, Worklist);
+        GlobalFlags |= Flags;
+      }
     }
   }
 
@@ -1568,8 +1550,6 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(*MBB, *MI);
       break;
-    default:
-      continue;
     }
     if (SplitPoint)
       splitBlock(MBB, SplitPoint);
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index a3144109b7204..a46c383115e2d 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -251,6 +251,9 @@ namespace {
       SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI];
       if (Exclusive && Predicates.size() != 1)
         return false;
+      // We do not know how to convert an else predicate of a VCTP.
+      if (getVPTInstrPredicate(*MI) == ARMVCC::Else)
+        return false;
       return llvm::any_of(Predicates, isVCTP);
     }
 
@@ -305,8 +308,12 @@ namespace {
       // isn't predicated on entry, check whether the vctp is within the block
       // and that all other instructions are then predicated on it.
       for (auto &Block : Blocks) {
-        if (isEntryPredicatedOnVCTP(Block, false) ||
-            hasImplicitlyValidVPT(Block, RDA))
+        if (isEntryPredicatedOnVCTP(Block, false) &&
+            !any_of(drop_begin(Block.getInsts()), [](const MachineInstr *MI) {
+              return getVPTInstrPredicate(*MI) == ARMVCC::Else;
+            }))
+          continue;
+        if (hasImplicitlyValidVPT(Block, RDA))
           continue;
 
         SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index 639771ab9eabb..84ef582c029d3 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -296,8 +296,7 @@ def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units",
 def FeaturePrefixInstrs : SubtargetFeature<"prefix-instrs", "HasPrefixInstrs",
                                            "true",
                                            "Enable prefixed instructions",
-                                           [FeatureISA3_0, FeatureP8Vector,
-                                            FeatureP9Altivec]>;
+                                           [FeatureISA3_1]>;
 def FeaturePCRelativeMemops :
   SubtargetFeature<"pcrelative-memops", "HasPCRelativeMemops", "true",
                    "Enable PC relative Memory Ops",
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8450ce9e0e3b3..a0e91f4dc3a4a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9460,7 +9460,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   // double. This is to exploit the XXSPLTIDP instruction.
   // If we lose precision, we use XXSPLTI32DX.
   if (BVNIsConstantSplat && (SplatBitSize == 64) &&
-      Subtarget.hasPrefixInstrs()) {
+      Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
     // Check the type first to short-circuit so we don't modify APSplatBits if
     // this block isn't executed.
     if ((Op->getValueType(0) == MVT::v2f64) &&
@@ -9605,11 +9605,11 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   // with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to
   // make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be
   // turned into a 4-byte splat of 0xABABABAB.
-  if (Subtarget.hasPrefixInstrs() && SplatSize == 2)
+  if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 2)
     return getCanonicalConstSplat(SplatBits | (SplatBits << 16), SplatSize * 2,
                                   Op.getValueType(), DAG, dl);
 
-  if (Subtarget.hasPrefixInstrs() && SplatSize == 4)
+  if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 4)
     return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
                                   dl);
 
@@ -10242,7 +10242,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
   }
 
-  if (Subtarget.hasPrefixInstrs()) {
+  if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
     SDValue SplatInsertNode;
     if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
       return SplatInsertNode;
@@ -17730,7 +17730,7 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
     return false;
   case MVT::f32:
   case MVT::f64: {
-    if (Subtarget.hasPrefixInstrs()) {
+    if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
       // we can materialize all immediatess via XXSPLTI32DX and XXSPLTIDP.
       return true;
     }
@@ -18314,11 +18314,12 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
   // Compute subtarget flags.
   if (!Subtarget.hasP9Vector())
     FlagSet |= PPC::MOF_SubtargetBeforeP9;
-  else {
+  else
     FlagSet |= PPC::MOF_SubtargetP9;
-    if (Subtarget.hasPrefixInstrs())
-      FlagSet |= PPC::MOF_SubtargetP10;
-  }
+
+  if (Subtarget.hasPrefixInstrs())
+    FlagSet |= PPC::MOF_SubtargetP10;
+
   if (Subtarget.hasSPE())
     FlagSet |= PPC::MOF_SubtargetSPE;
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index 5f2937d47a519..2fd5978a23c80 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -654,13 +654,10 @@ let Predicates = [PrefixInstrs] in {
                                  (ins s34imm:$SI),
                                  "pli $RT, $SI", IIC_IntSimple, []>;
   }
+}
 
+let Predicates = [PrefixInstrs, HasFPU] in {
   let mayLoad = 1, mayStore = 0 in {
-    defm PLXV :
-      8LS_DForm_R_SI34_XT6_RA5_MEM_p<25, (outs vsrc:$XST), (ins (memri34 $D, $RA):$addr),
-                                     (ins (memri34_pcrel $D, $RA):$addr),
-                                     (ins s34imm_pcrel:$D),
-                                     "plxv $XST, $addr", "plxv $XST, $D", IIC_LdStLFD>;
     defm PLFS :
       MLS_DForm_R_SI34_RTA5_MEM_p<48, (outs f4rc:$RST), (ins (memri34 $D, $RA):$addr),
                                   (ins (memri34_pcrel $D, $RA):$addr),
@@ -671,6 +668,28 @@ let Predicates = [PrefixInstrs] in {
                                   (ins  (memri34_pcrel $D, $RA):$addr),
                                   (ins s34imm_pcrel:$D), "plfd $RST, $addr",
                                   "plfd $RST, $D", IIC_LdStLFD>;
+  }
+  let mayStore = 1, mayLoad = 0 in {
+    defm PSTFS :
+      MLS_DForm_R_SI34_RTA5_MEM_p<52, (outs), (ins f4rc:$RST, (memri34 $D, $RA):$addr),
+                                  (ins f4rc:$RST, (memri34_pcrel $D, $RA):$addr),
+                                  (ins f4rc:$RST, s34imm_pcrel:$D),
+                                  "pstfs $RST, $addr", "pstfs $RST, $D", IIC_LdStLFD>;
+    defm PSTFD :
+      MLS_DForm_R_SI34_RTA5_MEM_p<54, (outs), (ins f8rc:$RST, (memri34 $D, $RA):$addr),
+                                  (ins f8rc:$RST, (memri34_pcrel $D, $RA):$addr),
+                                  (ins f8rc:$RST, s34imm_pcrel:$D),
+                                  "pstfd $RST, $addr", "pstfd $RST, $D", IIC_LdStLFD>;
+  }
+}
+
+let Predicates = [PrefixInstrs, HasP10Vector] in {
+  let mayLoad = 1, mayStore = 0 in {
+    defm PLXV :
+      8LS_DForm_R_SI34_XT6_RA5_MEM_p<25, (outs vsrc:$XST), (ins (memri34 $D, $RA):$addr),
+                                     (ins (memri34_pcrel $D, $RA):$addr),
+                                     (ins s34imm_pcrel:$D),
+                                     "plxv $XST, $addr", "plxv $XST, $D", IIC_LdStLFD>;
     defm PLXSSP :
       8LS_DForm_R_SI34_RTA5_MEM_p<43, (outs vfrc:$RST), (ins (memri34 $D, $RA):$addr),
                                   (ins (memri34_pcrel $D, $RA):$addr),
@@ -683,6 +702,28 @@ let Predicates = [PrefixInstrs] in {
                                   (ins s34imm_pcrel:$D),
                                   "plxsd $RST, $addr", "plxsd $RST, $D",
                                   IIC_LdStLFD>;
+  }
+  let mayStore = 1, mayLoad = 0 in {
+    defm PSTXV :
+      8LS_DForm_R_SI34_XT6_RA5_MEM_p<27, (outs), (ins vsrc:$XST, (memri34 $D, $RA):$addr),
+                                     (ins vsrc:$XST, (memri34_pcrel $D, $RA):$addr),
+                                     (ins vsrc:$XST, s34imm_pcrel:$D),
+                                     "pstxv $XST, $addr", "pstxv $XST, $D", IIC_LdStLFD>;
+    defm PSTXSSP :
+      8LS_DForm_R_SI34_RTA5_MEM_p<47, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr),
+                                  (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr),
+                                  (ins vfrc:$RST, s34imm_pcrel:$D),
+                                  "pstxssp $RST, $addr", "pstxssp $RST, $D", IIC_LdStLFD>;
+    defm PSTXSD :
+      8LS_DForm_R_SI34_RTA5_MEM_p<46, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr),
+                                  (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr),
+                                  (ins vfrc:$RST, s34imm_pcrel:$D),
+                                  "pstxsd $RST, $addr", "pstxsd $RST, $D", IIC_LdStLFD>;
+  }
+}
+
+let Predicates = [PrefixInstrs] in {
+  let mayLoad = 1, mayStore = 0 in {
     let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
       defm PLBZ8 :
         MLS_DForm_R_SI34_RTA5_MEM_p<34, (outs g8rc:$RST), (ins (memri34 $D, $RA):$addr),
@@ -745,31 +786,6 @@ let Predicates = [PrefixInstrs] in {
   }
 
   let mayStore = 1, mayLoad = 0 in {
-    defm PSTXV :
-      8LS_DForm_R_SI34_XT6_RA5_MEM_p<27, (outs), (ins vsrc:$XST, (memri34 $D, $RA):$addr),
-                                     (ins vsrc:$XST, (memri34_pcrel $D, $RA):$addr),
-                                     (ins vsrc:$XST, s34imm_pcrel:$D),
-                                     "pstxv $XST, $addr", "pstxv $XST, $D", IIC_LdStLFD>;
-    defm PSTFS :
-      MLS_DForm_R_SI34_RTA5_MEM_p<52, (outs), (ins f4rc:$RST, (memri34 $D, $RA):$addr),
-                                  (ins f4rc:$RST, (memri34_pcrel $D, $RA):$addr),
-                                  (ins f4rc:$RST, s34imm_pcrel:$D),
-                                  "pstfs $RST, $addr", "pstfs $RST, $D", IIC_LdStLFD>;
-    defm PSTFD :
-      MLS_DForm_R_SI34_RTA5_MEM_p<54, (outs), (ins f8rc:$RST, (memri34 $D, $RA):$addr),
-                                  (ins f8rc:$RST, (memri34_pcrel $D, $RA):$addr),
-                                  (ins f8rc:$RST, s34imm_pcrel:$D),
-                                  "pstfd $RST, $addr", "pstfd $RST, $D", IIC_LdStLFD>;
-    defm PSTXSSP :
-      8LS_DForm_R_SI34_RTA5_MEM_p<47, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr),
-                                  (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr),
-                                  (ins vfrc:$RST, s34imm_pcrel:$D),
-                                  "pstxssp $RST, $addr", "pstxssp $RST, $D", IIC_LdStLFD>;
-    defm PSTXSD :
-      8LS_DForm_R_SI34_RTA5_MEM_p<46, (outs), (ins vfrc:$RST, (memri34 $D, $RA):$addr),
-                                  (ins vfrc:$RST, (memri34_pcrel $D, $RA):$addr),
-                                  (ins vfrc:$RST, s34imm_pcrel:$D),
-                                  "pstxsd $RST, $addr", "pstxsd $RST, $D", IIC_LdStLFD>;
     let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
       defm PSTB8 :
         MLS_DForm_R_SI34_RTA5_MEM_p<38, (outs), (ins g8rc:$RST, (memri34 $D, $RA):$addr),
@@ -1136,7 +1152,7 @@ let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops] in {
                                []>;
 }
 
-let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] in {
+let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in {
   defm PLXVP :
     8LS_DForm_R_XTp5_SI34_MEM_p<58, (outs vsrprc:$XTp), (ins (memri34 $D, $RA):$addr),
                                 (ins (memri34_pcrel $D, $RA):$addr),
@@ -1145,7 +1161,7 @@ let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] i
                                 IIC_LdStLFD>;
 }
 
-let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs] in {
+let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in {
   defm PSTXVP :
     8LS_DForm_R_XTp5_SI34_MEM_p<62, (outs), (ins vsrprc:$XTp, (memri34 $D, $RA):$addr),
                                 (ins vsrprc:$XTp, (memri34_pcrel $D, $RA):$addr),
@@ -1157,7 +1173,7 @@ let Predicates = [PairedVectorMemops] in {
   // Intrinsics for Paired Vector Loads.
   def : Pat<(v256i1 (int_ppc_vsx_lxvp DQForm:$src)), (LXVP memrix16:$src)>;
   def : Pat<(v256i1 (int_ppc_vsx_lxvp XForm:$src)), (LXVPX XForm:$src)>;
-  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+  let Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in {
     def : Pat<(v256i1 (int_ppc_vsx_lxvp PDForm:$src)), (PLXVP memri34:$src)>;
   }
   // Intrinsics for Paired Vector Stores.
@@ -1165,7 +1181,7 @@ let Predicates = [PairedVectorMemops] in {
             (STXVP $XSp, memrix16:$dst)>;
   def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, XForm:$dst),
             (STXVPX $XSp, XForm:$dst)>;
-  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+  let Predicates = [PairedVectorMemops, PrefixInstrs, HasP10Vector] in {
     def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, PDForm:$dst),
               (PSTXVP $XSp, memri34:$dst)>;
   }
@@ -1236,6 +1252,9 @@ let Predicates = [PCRelativeMemops] in {
   def : Pat<(store i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
             (PSTDpc $RS, $ga, 0)>;
 
+}
+
+let Predicates = [PCRelativeMemops, HasFPU] in {
   // Load f32
   def : Pat<(f32 (load (PPCmatpcreladdr PCRelForm:$addr))), (PLFSpc $addr, 0)>;
 
@@ -1252,6 +1271,11 @@ let Predicates = [PCRelativeMemops] in {
   def : Pat<(store f64:$FRS, (PPCmatpcreladdr PCRelForm:$ga)),
             (PSTFDpc $FRS, $ga, 0)>;
 
+  def : Pat<(v4f32 (PPCldvsxlh (PPCmatpcreladdr PCRelForm:$addr))),
+            (SUBREG_TO_REG (i64 1), (PLFDpc $addr, 0), sub_64)>;
+}
+
+let Predicates = [PCRelativeMemops, HasP10Vector] in {
   // Load f128
   def : Pat<(f128 (load (PPCmatpcreladdr PCRelForm:$addr))),
             (COPY_TO_REGCLASS (PLXVpc $addr, 0), VRRC)>;
@@ -1288,6 +1312,14 @@ let Predicates = [PCRelativeMemops] in {
   def : Pat<(store v2f64:$XS, (PPCmatpcreladdr PCRelForm:$ga)),
             (PSTXVpc $XS, $ga, 0)>;
 
+  // Special Cases For PPCstore_scal_int_from_vsr
+  def : Pat<(PPCstore_scal_int_from_vsr f64:$src, (PPCmatpcreladdr PCRelForm:$dst), 8),
+            (PSTXSDpc $src, $dst, 0)>;
+  def : Pat<(PPCstore_scal_int_from_vsr f128:$src, (PPCmatpcreladdr PCRelForm:$dst), 8),
+            (PSTXSDpc (COPY_TO_REGCLASS $src, VFRC), $dst, 0)>;
+}
+
+let Predicates = [PCRelativeMemops] in {
   // Atomic Load
   def : Pat<(i32 (atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga))),
             (PLBZpc $ga, 0)>;
@@ -1314,15 +1346,6 @@ let Predicates = [PCRelativeMemops] in {
   def : Pat<(atomic_store_64 i64:$RS, (PPCmatpcreladdr PCRelForm:$ga)),
             (PSTDpc $RS, $ga, 0)>;
 
-  // Special Cases For PPCstore_scal_int_from_vsr
-  def : Pat<(PPCstore_scal_int_from_vsr f64:$src, (PPCmatpcreladdr PCRelForm:$dst), 8),
-            (PSTXSDpc $src, $dst, 0)>;
-  def : Pat<(PPCstore_scal_int_from_vsr f128:$src, (PPCmatpcreladdr PCRelForm:$dst), 8),
-            (PSTXSDpc (COPY_TO_REGCLASS $src, VFRC), $dst, 0)>;
-
-  def : Pat<(v4f32 (PPCldvsxlh (PPCmatpcreladdr PCRelForm:$addr))),
-            (SUBREG_TO_REG (i64 1), (PLFDpc $addr, 0), sub_64)>;
-
   // If the PPCmatpcreladdr node is not caught by any other pattern it should be
   // caught here and turned into a paddi instruction to materialize the address.
   def : Pat<(PPCmatpcreladdr PCRelForm:$addr), (PADDI8pc 0, $addr)>;
@@ -1335,7 +1358,7 @@ let Predicates = [PCRelativeMemops] in {
             (PADDI8 $in, $addr)>;
 }
 
-let Predicates = [PrefixInstrs] in {
+let Predicates = [PrefixInstrs, HasP10Vector] in {
   def XXPERMX :
     8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
                             vsrc:$XC, u3imm:$IMM),
@@ -2142,7 +2165,7 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {
 class xxevalPattern <dag pattern, bits<8> imm> :
   Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
 
-let AddedComplexity = 400, Predicates = [PrefixInstrs] in {
+let AddedComplexity = 400, Predicates = [PrefixInstrs, HasP10Vector] in {
  def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A,
                                 i32immNonAllOneNonZero:$A,
                                 i32immNonAllOneNonZero:$A,
@@ -2279,7 +2302,7 @@ def : Pat<(f64 nzFPImmAsi64:$A),
             (PSTXSD (COPY_TO_REGCLASS $src, VFRC), PDForm:$dst)>;
 }
 
-let Predicates = [PrefixInstrs] in {
+let Predicates = [PrefixInstrs, HasP10Vector] in {
   def : Pat<(i32 imm34:$imm), (PLI (getImmAs64BitInt imm:$imm))>;
   def : Pat<(i64 imm34:$imm), (PLI8 (getImmAs64BitInt imm:$imm))>;
   def : Pat<(v16i8 (int_ppc_vsx_xxpermx v16i8:$A, v16i8:$B, v16i8:$C, timm:$D)),
@@ -2300,7 +2323,9 @@ let Predicates = [PrefixInstrs] in {
             (XXBLENDVW $A, $B, $C)>;
   def : Pat<(int_ppc_vsx_xxblendvd v2i64:$A, v2i64:$B, v2i64:$C),
             (XXBLENDVD $A, $B, $C)>;
+}
 
+let Predicates = [PrefixInstrs] in {
   // Anonymous patterns to select prefixed loads and stores.
   // Load i32
   def : Pat<(i32 (extloadi1 PDForm:$src)), (PLBZ memri34:$src)>;
@@ -2335,7 +2360,9 @@ let Predicates = [PrefixInstrs] in {
   def : Pat<(truncstorei16 i64:$rS, PDForm:$dst), (PSTH8 g8rc:$rS, memri34:$dst)>;
   def : Pat<(truncstorei32 i64:$rS, PDForm:$dst), (PSTW8 g8rc:$rS, memri34:$dst)>;
   def : Pat<(store i64:$rS, PDForm:$dst), (PSTD g8rc:$rS, memri34:$dst)>;
+}
 
+let Predicates = [PrefixInstrs, HasFPU] in {
   // Load / Store f32
   def : Pat<(f32 (load PDForm:$src)), (PLFS memri34:$src)>;
   def : Pat<(store f32:$FRS, PDForm:$dst), (PSTFS $FRS, memri34:$dst)>;
@@ -2345,7 +2372,13 @@ let Predicates = [PrefixInstrs] in {
             (COPY_TO_REGCLASS (PLFS memri34:$src), VSFRC)>;
   def : Pat<(f64 (load PDForm:$src)), (PLFD memri34:$src)>;
   def : Pat<(store f64:$FRS, PDForm:$dst), (PSTFD $FRS, memri34:$dst)>;
+  // Prefixed fpext to v2f64
+  def : Pat<(v4f32 (PPCldvsxlh PDForm:$src)),
+            (SUBREG_TO_REG (i64 1), (PLFD PDForm:$src), sub_64)>;
 
+}
+
+let Predicates = [PrefixInstrs] in {
   // Atomic Load
   def : Pat<(i32 (atomic_load_8 PDForm:$src)), (PLBZ memri34:$src)>;
   def : Pat<(i32 (atomic_load_16 PDForm:$src)), (PLHZ memri34:$src)>;
@@ -2357,10 +2390,6 @@ let Predicates = [PrefixInstrs] in {
   def : Pat<(atomic_store_16 i32:$RS, PDForm:$dst), (PSTH $RS, memri34:$dst)>;
   def : Pat<(atomic_store_32 i32:$RS, PDForm:$dst), (PSTW $RS, memri34:$dst)>;
   def : Pat<(atomic_store_64 i64:$RS, PDForm:$dst), (PSTD $RS, memri34:$dst)>;
-
-  // Prefixed fpext to v2f64
-  def : Pat<(v4f32 (PPCldvsxlh PDForm:$src)),
-            (SUBREG_TO_REG (i64 1), (PLFD PDForm:$src), sub_64)>;
 }
 
 def InsertEltShift {
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 7e4cd6c72aa87..9e8da59615dfb 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1695,7 +1695,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // transform it to the prefixed version so we don't have to use the XForm.
   if ((OpC == PPC::LXVP || OpC == PPC::STXVP) &&
       (!isInt<16>(Offset) || (Offset % offsetMinAlign(MI)) != 0) &&
-      Subtarget.hasPrefixInstrs()) {
+      Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
     unsigned NewOpc = OpC == PPC::LXVP ? PPC::PLXVP : PPC::PSTXVP;
     MI.setDesc(TII.get(NewOpc));
     OpC = NewOpc;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f0e5a7d393b6c..e99c6208594e3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3125,7 +3125,7 @@ lowerVectorStrictFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG,
   Chain = Unorder.getValue(1);
   Src = DAG.getNode(RISCVISD::STRICT_FADD_VL, DL,
                     DAG.getVTList(ContainerVT, MVT::Other),
-                    {Chain, Src, Src, DAG.getUNDEF(ContainerVT), Unorder, VL});
+                    {Chain, Src, Src, Src, Unorder, VL});
   Chain = Src.getValue(1);
 
   // We do the conversion on the absolute value and fix the sign at the end.
@@ -13704,6 +13704,44 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Combine vXi32 (mul (and (lshr X, 15), 0x10001), 0xffff) ->
+// (bitcast (sra (v2Xi16 (bitcast X)), 15))
+// Same for other equivalent types with other equivalent constants.
+static SDValue combineVectorMulToSraBitcast(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Do this for legal vectors unless they are i1 or i8 vectors.
+  if (!VT.isVector() || !TLI.isTypeLegal(VT) || VT.getScalarSizeInBits() < 16)
+    return SDValue();
+
+  if (N->getOperand(0).getOpcode() != ISD::AND ||
+      N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
+    return SDValue();
+
+  SDValue And = N->getOperand(0);
+  SDValue Srl = And.getOperand(0);
+
+  APInt V1, V2, V3;
+  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
+      !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
+      !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
+    return SDValue();
+
+  unsigned HalfSize = VT.getScalarSizeInBits() / 2;
+  if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
+      V3 != (HalfSize - 1))
+    return SDValue();
+
+  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
+                                EVT::getIntegerVT(*DAG.getContext(), HalfSize),
+                                VT.getVectorElementCount() * 2);
+  SDLoc DL(N);
+  SDValue Cast = DAG.getNode(ISD::BITCAST, DL, HalfVT, Srl.getOperand(0));
+  SDValue Sra = DAG.getNode(ISD::SRA, DL, HalfVT, Cast,
+                            DAG.getConstant(HalfSize - 1, DL, HalfVT));
+  return DAG.getNode(ISD::BITCAST, DL, VT, Sra);
+}
 
 static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
@@ -13748,6 +13786,9 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineBinOpOfZExt(N, DAG))
     return V;
 
+  if (SDValue V = combineVectorMulToSraBitcast(N, DAG))
+    return V;
+
   return SDValue();
 }
 
@@ -16087,6 +16128,57 @@ static bool matchIndexAsWiderOp(EVT VT, SDValue Index, SDValue Mask,
   return true;
 }
 
+static SDValue combineTruncOfSraSext(SDNode *N, SelectionDAG &DAG) {
+  // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1))
+  // This is beneficial when X and Y are low-precision vectors of the same
+  // value type. Since the truncate would be lowered into n levels of
+  // TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate restriction, such
+  // a pattern would later be expanded into a series of "vsetvli" and "vnsrl"
+  // instructions to reach this point.
+  auto IsTruncNode = [](SDValue V) {
+    if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL)
+      return false;
+    SDValue VL = V.getOperand(2);
+    auto *C = dyn_cast<ConstantSDNode>(VL);
+    // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand
+    bool IsVLMAXForVMSET = (C && C->isAllOnes()) ||
+                           (isa<RegisterSDNode>(VL) &&
+                            cast<RegisterSDNode>(VL)->getReg() == RISCV::X0);
+    return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL && IsVLMAXForVMSET;
+  };
+
+  SDValue Op = N->getOperand(0);
+
+  // We first need to find the innermost TRUNCATE_VECTOR_VL node in order
+  // to identify such a pattern.
+  while (IsTruncNode(Op)) {
+    if (!Op.hasOneUse())
+      return SDValue();
+    Op = Op.getOperand(0);
+  }
+
+  if (Op.getOpcode() != ISD::SRA || !Op.hasOneUse())
+    return SDValue();
+
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  if (N0.getOpcode() != ISD::SIGN_EXTEND || !N0.hasOneUse() ||
+      N1.getOpcode() != ISD::ZERO_EXTEND || !N1.hasOneUse())
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+  SDValue N10 = N1.getOperand(0);
+  if (!N00.getValueType().isVector() ||
+      N00.getValueType() != N10.getValueType() ||
+      N->getValueType(0) != N10.getValueType())
+    return SDValue();
+
+  unsigned MaxShAmt = N10.getValueType().getScalarSizeInBits() - 1;
+  SDValue SMin =
+      DAG.getNode(ISD::SMIN, SDLoc(N1), N->getValueType(0), N10,
+                  DAG.getConstant(MaxShAmt, SDLoc(N1), N->getValueType(0)));
+  return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin);
+}
 
 SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
@@ -16304,56 +16396,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       }
     }
     return SDValue();
-  case RISCVISD::TRUNCATE_VECTOR_VL: {
-    // trunc (sra sext (X), zext (Y)) -> sra (X, smin (Y, scalarsize(Y) - 1))
-    // This would be benefit for the cases where X and Y are both the same value
-    // type of low precision vectors. Since the truncate would be lowered into
-    // n-levels TRUNCATE_VECTOR_VL to satisfy RVV's SEW*2->SEW truncate
-    // restriction, such pattern would be expanded into a series of "vsetvli"
-    // and "vnsrl" instructions later to reach this point.
-    auto IsTruncNode = [](SDValue V) {
-      if (V.getOpcode() != RISCVISD::TRUNCATE_VECTOR_VL)
-        return false;
-      SDValue VL = V.getOperand(2);
-      auto *C = dyn_cast<ConstantSDNode>(VL);
-      // Assume all TRUNCATE_VECTOR_VL nodes use VLMAX for VMSET_VL operand
-      bool IsVLMAXForVMSET = (C && C->isAllOnes()) ||
-                             (isa<RegisterSDNode>(VL) &&
-                              cast<RegisterSDNode>(VL)->getReg() == RISCV::X0);
-      return V.getOperand(1).getOpcode() == RISCVISD::VMSET_VL &&
-             IsVLMAXForVMSET;
-    };
-
-    SDValue Op = N->getOperand(0);
-
-    // We need to first find the inner level of TRUNCATE_VECTOR_VL node
-    // to distinguish such pattern.
-    while (IsTruncNode(Op)) {
-      if (!Op.hasOneUse())
-        return SDValue();
-      Op = Op.getOperand(0);
-    }
-
-    if (Op.getOpcode() == ISD::SRA && Op.hasOneUse()) {
-      SDValue N0 = Op.getOperand(0);
-      SDValue N1 = Op.getOperand(1);
-      if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
-          N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse()) {
-        SDValue N00 = N0.getOperand(0);
-        SDValue N10 = N1.getOperand(0);
-        if (N00.getValueType().isVector() &&
-            N00.getValueType() == N10.getValueType() &&
-            N->getValueType(0) == N10.getValueType()) {
-          unsigned MaxShAmt = N10.getValueType().getScalarSizeInBits() - 1;
-          SDValue SMin = DAG.getNode(
-              ISD::SMIN, SDLoc(N1), N->getValueType(0), N10,
-              DAG.getConstant(MaxShAmt, SDLoc(N1), N->getValueType(0)));
-          return DAG.getNode(ISD::SRA, SDLoc(N), N->getValueType(0), N00, SMin);
-        }
-      }
-    }
-    break;
-  }
+  case RISCVISD::TRUNCATE_VECTOR_VL:
+    return combineTruncOfSraSext(N, DAG);
   case ISD::TRUNCATE:
     return performTRUNCATECombine(N, DAG, Subtarget);
   case ISD::SELECT:
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index c0b2a695b8ea4..2c0a807e44685 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -882,7 +882,7 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
   StringRef getPassName() const override { return RISCV_INSERT_VSETVLI_NAME; }
 
 private:
-  bool needVSETVLI(const MachineInstr &MI, const VSETVLIInfo &Require,
+  bool needVSETVLI(const DemandedFields &Used, const VSETVLIInfo &Require,
                    const VSETVLIInfo &CurInfo) const;
   bool needVSETVLIPHI(const VSETVLIInfo &Require,
                       const MachineBasicBlock &MBB) const;
@@ -1175,17 +1175,13 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
 }
 
 /// Return true if a VSETVLI is required to transition from CurInfo to Require
-/// before MI.
-bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
+/// given a set of DemandedFields \p Used.
+bool RISCVInsertVSETVLI::needVSETVLI(const DemandedFields &Used,
                                      const VSETVLIInfo &Require,
                                      const VSETVLIInfo &CurInfo) const {
-  assert(Require == computeInfoForInstr(MI, MI.getDesc().TSFlags, *ST, LIS));
-
   if (!CurInfo.isValid() || CurInfo.isUnknown() || CurInfo.hasSEWLMULRatioOnly())
     return true;
 
-  DemandedFields Used = getDemanded(MI, ST);
-
   if (CurInfo.isCompatible(Used, Require, LIS))
     return false;
 
@@ -1232,16 +1228,17 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
   if (!RISCVII::hasSEWOp(TSFlags))
     return;
 
+  DemandedFields Demanded = getDemanded(MI, ST);
+
   const VSETVLIInfo NewInfo = computeInfoForInstr(MI, TSFlags, *ST, LIS);
   assert(NewInfo.isValid() && !NewInfo.isUnknown());
-  if (Info.isValid() && !needVSETVLI(MI, NewInfo, Info))
+  if (Info.isValid() && !needVSETVLI(Demanded, NewInfo, Info))
     return;
 
   const VSETVLIInfo PrevInfo = Info;
   if (!Info.isValid() || Info.isUnknown())
     Info = NewInfo;
 
-  DemandedFields Demanded = getDemanded(MI, ST);
   const VSETVLIInfo IncomingInfo = adjustIncoming(PrevInfo, NewInfo, Demanded);
 
   // If MI only demands that VL has the same zeroness, we only need to set the
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index ce50fe6e2cbb0..a1b078910e29c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1311,6 +1311,26 @@ def : Pat<(FrameAddrRegImm (iPTR GPR:$rs1), simm12:$imm12),
 
 /// HI and ADD_LO address nodes.
 
+// Pseudo for a rematerializable LUI+ADDI sequence for loading an address.
+// It will be expanded after register allocation.
+// FIXME: The scheduling information does not reflect the multiple instructions.
+let Size = 8, isReMaterializable = 1 in
+def PseudoMovAddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>,
+                    Sched<[WriteIALU]>;
+
+def riscv_hi_oneuse : unop_oneuse<riscv_hi>;
+def addr_hi_lo : PatFrag<(ops node:$hi, node:$lo),
+                         (riscv_add_lo (riscv_hi_oneuse node:$hi), node:$lo)>;
+
+def : Pat<(addr_hi_lo tglobaladdr:$hi, tglobaladdr:$lo),
+          (PseudoMovAddr tglobaladdr:$hi, tglobaladdr:$lo)>;
+def : Pat<(addr_hi_lo tblockaddress:$hi, tblockaddress:$lo),
+          (PseudoMovAddr tblockaddress:$hi, tblockaddress:$lo)>;
+def : Pat<(addr_hi_lo tjumptable:$hi, tjumptable:$lo),
+          (PseudoMovAddr tjumptable:$hi, tjumptable:$lo)>;
+def : Pat<(addr_hi_lo tconstpool:$hi, tconstpool:$lo),
+          (PseudoMovAddr tconstpool:$hi, tconstpool:$lo)>;
+
 def : Pat<(riscv_hi tglobaladdr:$in), (LUI tglobaladdr:$in)>;
 def : Pat<(riscv_hi tblockaddress:$in), (LUI tblockaddress:$in)>;
 def : Pat<(riscv_hi tjumptable:$in), (LUI tjumptable:$in)>;
diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index 410989177a8b9..fecc83a821f42 100644
--- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -84,7 +84,8 @@ INITIALIZE_PASS(RISCVMergeBaseOffsetOpt, DEBUG_TYPE,
 //    3) The offset value in the Global Address or Constant Pool is 0.
 bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi,
                                              MachineInstr *&Lo) {
-  if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC)
+  if (Hi.getOpcode() != RISCV::LUI && Hi.getOpcode() != RISCV::AUIPC &&
+      Hi.getOpcode() != RISCV::PseudoMovAddr)
     return false;
 
   const MachineOperand &HiOp1 = Hi.getOperand(1);
@@ -97,16 +98,22 @@ bool RISCVMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi,
       HiOp1.getOffset() != 0)
     return false;
 
-  Register HiDestReg = Hi.getOperand(0).getReg();
-  if (!MRI->hasOneUse(HiDestReg))
-    return false;
+  if (Hi.getOpcode() == RISCV::PseudoMovAddr) {
+    // Most of the code should handle it correctly without modification if
+    // both Lo and Hi are set to point to the PseudoMovAddr.
+    Lo = &Hi;
+  } else {
+    Register HiDestReg = Hi.getOperand(0).getReg();
+    if (!MRI->hasOneUse(HiDestReg))
+      return false;
 
-  Lo = &*MRI->use_instr_begin(HiDestReg);
-  if (Lo->getOpcode() != RISCV::ADDI)
-    return false;
+    Lo = &*MRI->use_instr_begin(HiDestReg);
+    if (Lo->getOpcode() != RISCV::ADDI)
+      return false;
+  }
 
   const MachineOperand &LoOp2 = Lo->getOperand(2);
-  if (Hi.getOpcode() == RISCV::LUI) {
+  if (Hi.getOpcode() == RISCV::LUI || Hi.getOpcode() == RISCV::PseudoMovAddr) {
     if (LoOp2.getTargetFlags() != RISCVII::MO_LO ||
         !(LoOp2.isGlobal() || LoOp2.isCPI() || LoOp2.isBlockAddress()) ||
         LoOp2.getOffset() != 0)
@@ -466,6 +473,13 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
 
   Hi.getOperand(1).setOffset(NewOffset);
   MachineOperand &ImmOp = Lo.getOperand(2);
+  // Expand PseudoMovAddr into LUI
+  if (Hi.getOpcode() == RISCV::PseudoMovAddr) {
+    auto *TII = ST->getInstrInfo();
+    Hi.setDesc(TII->get(RISCV::LUI));
+    Hi.removeOperand(2);
+  }
+
   if (Hi.getOpcode() != RISCV::AUIPC)
     ImmOp.setOffset(NewOffset);
 
@@ -501,6 +515,11 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
     }
   }
 
+  // Prevent Lo (originally PseudoMovAddr, which is also pointed to by Hi) from
+  // being erased.
+  if (&Lo == &Hi)
+    return true;
+
   MRI->replaceRegWith(Lo.getOperand(0).getReg(), Hi.getOperand(0).getReg());
   Lo.eraseFromParent();
   return true;
diff --git a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
index 52f2ce27164d6..b7b0c47c084c6 100644
--- a/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVPostRAExpandPseudoInsts.cpp
@@ -44,6 +44,7 @@ class RISCVPostRAExpandPseudo : public MachineFunctionPass {
   bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                 MachineBasicBlock::iterator &NextMBBI);
   bool expandMovImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
+  bool expandMovAddr(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
 };
 
 char RISCVPostRAExpandPseudo::ID = 0;
@@ -75,6 +76,8 @@ bool RISCVPostRAExpandPseudo::expandMI(MachineBasicBlock &MBB,
   switch (MBBI->getOpcode()) {
   case RISCV::PseudoMovImm:
     return expandMovImm(MBB, MBBI);
+  case RISCV::PseudoMovAddr:
+    return expandMovAddr(MBB, MBBI);
   default:
     return false;
   }
@@ -101,6 +104,26 @@ bool RISCVPostRAExpandPseudo::expandMovImm(MachineBasicBlock &MBB,
   return true;
 }
 
+bool RISCVPostRAExpandPseudo::expandMovAddr(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MBBI) {
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  Register DstReg = MBBI->getOperand(0).getReg();
+  bool DstIsDead = MBBI->getOperand(0).isDead();
+  bool Renamable = MBBI->getOperand(0).isRenamable();
+
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::LUI))
+      .addReg(DstReg, RegState::Define | getRenamableRegState(Renamable))
+      .add(MBBI->getOperand(1));
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI))
+      .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead) |
+                          getRenamableRegState(Renamable))
+      .addReg(DstReg, RegState::Kill | getRenamableRegState(Renamable))
+      .add(MBBI->getOperand(2));
+  MBBI->eraseFromParent();
+  return true;
+}
+
 } // end of anonymous namespace
 
 INITIALIZE_PASS(RISCVPostRAExpandPseudo, "riscv-expand-pseudolisimm32",
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 5c286acdcc9b3..ff8759755e517 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -272,6 +272,13 @@ void SPIRVInstPrinter::printOpDecorate(const MCInst *MI, raw_ostream &O) {
     case Decoration::UserSemantic:
       printStringImm(MI, NumFixedOps, O);
       break;
+    case Decoration::HostAccessINTEL:
+      printOperand(MI, NumFixedOps, O);
+      if (NumFixedOps + 1 < MI->getNumOperands()) {
+        O << ' ';
+        printStringImm(MI, NumFixedOps + 1, O);
+      }
+      break;
     default:
       printRemainingVariableOps(MI, NumFixedOps, O, true);
       break;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 7f531542544ab..75aa1823b11f2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -30,6 +30,13 @@ static const std::map<std::string, SPIRV::Extension::Extension>
          SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_min_max},
         {"SPV_INTEL_arbitrary_precision_integers",
          SPIRV::Extension::Extension::SPV_INTEL_arbitrary_precision_integers},
+        {"SPV_INTEL_cache_controls",
+         SPIRV::Extension::Extension::SPV_INTEL_cache_controls},
+        {"SPV_INTEL_global_variable_fpga_decorations",
+         SPIRV::Extension::Extension::
+             SPV_INTEL_global_variable_fpga_decorations},
+        {"SPV_INTEL_global_variable_host_access",
+         SPIRV::Extension::Extension::SPV_INTEL_global_variable_host_access},
         {"SPV_INTEL_optnone", SPIRV::Extension::Extension::SPV_INTEL_optnone},
         {"SPV_INTEL_usm_storage_classes",
          SPIRV::Extension::Extension::SPV_INTEL_usm_storage_classes},
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index ea53fe55e7ab5..ffbd1e17bad5e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -181,6 +181,14 @@ static void setInsertPointSkippingPhis(IRBuilder<> &B, Instruction *I) {
     B.SetInsertPoint(I);
 }
 
+static void setInsertPointAfterDef(IRBuilder<> &B, Instruction *I) {
+  B.SetCurrentDebugLocation(I->getDebugLoc());
+  if (I->getType()->isVoidTy())
+    B.SetInsertPoint(I->getNextNode());
+  else
+    B.SetInsertPoint(*I->getInsertionPointAfterDef());
+}
+
 static bool requireAssignType(Instruction *I) {
   IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I);
   if (Intr) {
@@ -560,6 +568,7 @@ void SPIRVEmitIntrinsics::preprocessUndefs(IRBuilder<> &B) {
 
   while (!Worklist.empty()) {
     Instruction *I = Worklist.front();
+    bool BPrepared = false;
     Worklist.pop();
 
     for (auto &Op : I->operands()) {
@@ -567,7 +576,10 @@ void SPIRVEmitIntrinsics::preprocessUndefs(IRBuilder<> &B) {
       if (!AggrUndef || !Op->getType()->isAggregateType())
         continue;
 
-      B.SetInsertPoint(I);
+      if (!BPrepared) {
+        setInsertPointSkippingPhis(B, I);
+        BPrepared = true;
+      }
       auto *IntrUndef = B.CreateIntrinsic(Intrinsic::spv_undef, {}, {});
       Worklist.push(IntrUndef);
       I->replaceUsesOfWith(Op, IntrUndef);
@@ -584,6 +596,7 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) {
 
   while (!Worklist.empty()) {
     auto *I = Worklist.front();
+    bool IsPhi = isa<PHINode>(I), BPrepared = false;
     assert(I);
     bool KeepInst = false;
     for (const auto &Op : I->operands()) {
@@ -615,7 +628,11 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) {
         else
           for (auto &COp : AggrConst->operands())
             Args.push_back(COp);
-        B.SetInsertPoint(I);
+        if (!BPrepared) {
+          IsPhi ? B.SetInsertPointPastAllocas(I->getParent()->getParent())
+                : B.SetInsertPoint(I);
+          BPrepared = true;
+        }
         auto *CI =
             B.CreateIntrinsic(Intrinsic::spv_const_composite, {ResTy}, {Args});
         Worklist.push(CI);
@@ -1111,8 +1128,7 @@ void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
       isa<BitCastInst>(I))
     return;
 
-  setInsertPointSkippingPhis(B, I->getNextNode());
-
+  setInsertPointAfterDef(B, I);
   Type *ElemTy = deduceElementType(I);
   Constant *EltTyConst = UndefValue::get(ElemTy);
   unsigned AddressSpace = getPointerAddressSpace(I->getType());
@@ -1127,7 +1143,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
   reportFatalOnTokenType(I);
   Type *Ty = I->getType();
   if (!Ty->isVoidTy() && !isPointerTy(Ty) && requireAssignType(I)) {
-    setInsertPointSkippingPhis(B, I->getNextNode());
+    setInsertPointAfterDef(B, I);
     Type *TypeToAssign = Ty;
     if (auto *II = dyn_cast<IntrinsicInst>(I)) {
       if (II->getIntrinsicID() == Intrinsic::spv_const_composite ||
@@ -1149,7 +1165,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
       if (isa<UndefValue>(Op) && Op->getType()->isAggregateType())
         buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op,
                         UndefValue::get(B.getInt32Ty()), {}, B);
-      else if (!isa<Instruction>(Op)) // TODO: This case could be removed
+      else if (!isa<Instruction>(Op))
         buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op, {},
                         B);
     }
@@ -1159,7 +1175,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
 void SPIRVEmitIntrinsics::insertSpirvDecorations(Instruction *I,
                                                  IRBuilder<> &B) {
   if (MDNode *MD = I->getMetadata("spirv.Decorations")) {
-    B.SetInsertPoint(I->getNextNode());
+    setInsertPointAfterDef(B, I);
     B.CreateIntrinsic(Intrinsic::spv_assign_decoration, {I->getType()},
                       {I, MetadataAsValue::get(I->getContext(), MD)});
   }
@@ -1170,7 +1186,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
   auto *II = dyn_cast<IntrinsicInst>(I);
   if (II && II->getIntrinsicID() == Intrinsic::spv_const_composite &&
       TrackConstants) {
-    B.SetInsertPoint(I->getNextNode());
+    setInsertPointAfterDef(B, I);
     auto t = AggrConsts.find(I);
     assert(t != AggrConsts.end());
     auto *NewOp =
@@ -1179,6 +1195,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
     I->replaceAllUsesWith(NewOp);
     NewOp->setArgOperand(0, I);
   }
+  bool IsPhi = isa<PHINode>(I), BPrepared = false;
   for (const auto &Op : I->operands()) {
     if ((isa<ConstantAggregateZero>(Op) && Op->getType()->isVectorTy()) ||
         isa<PHINode>(I) || isa<SwitchInst>(I))
@@ -1188,11 +1205,14 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
       if (II && ((II->getIntrinsicID() == Intrinsic::spv_gep && OpNo == 0) ||
                  (II->paramHasAttr(OpNo, Attribute::ImmArg))))
         continue;
-      B.SetInsertPoint(I);
+      if (!BPrepared) {
+        IsPhi ? B.SetInsertPointPastAllocas(I->getParent()->getParent())
+              : B.SetInsertPoint(I);
+        BPrepared = true;
+      }
       Value *OpTyVal = Op;
       if (Op->getType()->isTargetExtTy())
-        OpTyVal = Constant::getNullValue(
-            IntegerType::get(I->getContext(), GR->getPointerSize()));
+        OpTyVal = PoisonValue::get(Op->getType());
       auto *NewOp = buildIntrWithMD(Intrinsic::spv_track_constant,
                                     {Op->getType(), OpTyVal->getType()}, Op,
                                     OpTyVal, {}, B);
@@ -1201,7 +1221,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
   }
   if (I->hasName()) {
     reportFatalOnTokenType(I);
-    setInsertPointSkippingPhis(B, I->getNextNode());
+    setInsertPointAfterDef(B, I);
     std::vector<Value *> Args = {I};
     addStringImm(I->getName(), B, Args);
     B.CreateIntrinsic(Intrinsic::spv_assign_name, {I->getType()}, Args);
@@ -1345,7 +1365,7 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
   for (auto *I : Worklist) {
     TrackConstants = true;
     if (!I->getType()->isVoidTy() || isa<StoreInst>(I))
-      B.SetInsertPoint(I->getNextNode());
+      setInsertPointAfterDef(B, I);
     // Visitors return either the original/newly created instruction for further
     // processing, nullptr otherwise.
     I = visit(*I);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index c86ab285f354f..61f99f8d85269 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -703,6 +703,15 @@ static void addOpDecorateReqs(const MachineInstr &MI, unsigned DecIndex,
         static_cast<SPIRV::LinkageType::LinkageType>(LinkageOp);
     if (LnkType == SPIRV::LinkageType::LinkOnceODR)
       Reqs.addExtension(SPIRV::Extension::SPV_KHR_linkonce_odr);
+  } else if (Dec == SPIRV::Decoration::CacheControlLoadINTEL ||
+             Dec == SPIRV::Decoration::CacheControlStoreINTEL) {
+    Reqs.addExtension(SPIRV::Extension::SPV_INTEL_cache_controls);
+  } else if (Dec == SPIRV::Decoration::HostAccessINTEL) {
+    Reqs.addExtension(SPIRV::Extension::SPV_INTEL_global_variable_host_access);
+  } else if (Dec == SPIRV::Decoration::InitModeINTEL ||
+             Dec == SPIRV::Decoration::ImplementInRegisterMapINTEL) {
+    Reqs.addExtension(
+        SPIRV::Extension::SPV_INTEL_global_variable_fpga_decorations);
   }
 }
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 85299a49a6b94..624899600693a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -40,6 +40,7 @@ class SPIRVPreLegalizer : public MachineFunctionPass {
 
 static void
 addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+                    const SPIRVSubtarget &STI,
                     DenseMap<MachineInstr *, Type *> &TargetExtConstTypes) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT;
@@ -82,8 +83,17 @@ addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
           if (Const->getType()->isTargetExtTy()) {
             // remember association so that we can restore it when assign types
             MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
-            if (SrcMI && SrcMI->getOpcode() == TargetOpcode::G_CONSTANT)
+            if (SrcMI && (SrcMI->getOpcode() == TargetOpcode::G_CONSTANT ||
+                          SrcMI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF))
               TargetExtConstTypes[SrcMI] = Const->getType();
+            if (Const->isNullValue()) {
+              MachineIRBuilder MIB(MF);
+              SPIRVType *ExtType =
+                  GR->getOrCreateSPIRVType(Const->getType(), MIB);
+              SrcMI->setDesc(STI.getInstrInfo()->get(SPIRV::OpConstantNull));
+              SrcMI->addOperand(MachineOperand::CreateReg(
+                  GR->getSPIRVTypeID(ExtType), false));
+            }
           }
         } else {
           RegsAlreadyAddedToDT[&MI] = Reg;
@@ -394,6 +404,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
     for (auto MII = std::prev(MBB->end()), Begin = MBB->begin();
          !ReachedBegin;) {
       MachineInstr &MI = *MII;
+      unsigned MIOp = MI.getOpcode();
 
       if (isSpvIntrinsic(MI, Intrinsic::spv_assign_ptr_type)) {
         Register Reg = MI.getOperand(1).getReg();
@@ -419,9 +430,9 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
         if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE)
           insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MF.getRegInfo());
         ToErase.push_back(&MI);
-      } else if (MI.getOpcode() == TargetOpcode::G_CONSTANT ||
-                 MI.getOpcode() == TargetOpcode::G_FCONSTANT ||
-                 MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR) {
+      } else if (MIOp == TargetOpcode::G_CONSTANT ||
+                 MIOp == TargetOpcode::G_FCONSTANT ||
+                 MIOp == TargetOpcode::G_BUILD_VECTOR) {
         // %rc = G_CONSTANT ty Val
         // ===>
         // %cty = OpType* ty
@@ -435,15 +446,15 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
             continue;
         }
         Type *Ty = nullptr;
-        if (MI.getOpcode() == TargetOpcode::G_CONSTANT) {
+        if (MIOp == TargetOpcode::G_CONSTANT) {
           auto TargetExtIt = TargetExtConstTypes.find(&MI);
           Ty = TargetExtIt == TargetExtConstTypes.end()
                    ? MI.getOperand(1).getCImm()->getType()
                    : TargetExtIt->second;
-        } else if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) {
+        } else if (MIOp == TargetOpcode::G_FCONSTANT) {
           Ty = MI.getOperand(1).getFPImm()->getType();
         } else {
-          assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+          assert(MIOp == TargetOpcode::G_BUILD_VECTOR);
           Type *ElemTy = nullptr;
           MachineInstr *ElemMI = MRI.getVRegDef(MI.getOperand(1).getReg());
           assert(ElemMI);
@@ -459,7 +470,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
           Ty = VectorType::get(ElemTy, NumElts, false);
         }
         insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI);
-      } else if (MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
+      } else if (MIOp == TargetOpcode::G_GLOBAL_VALUE) {
         propagateSPIRVType(&MI, GR, MRI, MIB);
       }
 
@@ -802,7 +813,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
   MachineIRBuilder MIB(MF);
   // a registry of target extension constants
   DenseMap<MachineInstr *, Type *> TargetExtConstTypes;
-  addConstantsToTrack(MF, GR, TargetExtConstTypes);
+  addConstantsToTrack(MF, GR, ST, TargetExtConstTypes);
   foldConstantsIntoIntrinsics(MF);
   insertBitcasts(MF, GR, MIB);
   generateAssignInstrs(MF, GR, MIB, TargetExtConstTypes);
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index a8a0577f60564..7bee87d7204ed 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -22,6 +22,7 @@
 #include "SPIRVSubtarget.h"
 #include "SPIRVTargetMachine.h"
 #include "SPIRVUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -29,6 +30,8 @@
 #include "llvm/IR/IntrinsicsSPIRV.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include <charconv>
+#include <regex>
 
 using namespace llvm;
 
@@ -152,6 +155,132 @@ static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
   return true;
 }
 
+static std::string getAnnotation(Value *AnnoVal, Value *OptAnnoVal) {
+  if (auto *Ref = dyn_cast_or_null<GetElementPtrInst>(AnnoVal))
+    AnnoVal = Ref->getOperand(0);
+  if (auto *Ref = dyn_cast_or_null<BitCastInst>(OptAnnoVal))
+    OptAnnoVal = Ref->getOperand(0);
+
+  std::string Anno;
+  if (auto *C = dyn_cast_or_null<Constant>(AnnoVal)) {
+    StringRef Str;
+    if (getConstantStringInfo(C, Str))
+      Anno = Str;
+  }
+  // Handle the optional annotation parameter the way the Khronos Translator
+  // does (collect integers wrapped in a struct).
+  if (auto *C = dyn_cast_or_null<Constant>(OptAnnoVal);
+      C && C->getNumOperands()) {
+    Value *MaybeStruct = C->getOperand(0);
+    if (auto *Struct = dyn_cast<ConstantStruct>(MaybeStruct)) {
+      for (unsigned I = 0, E = Struct->getNumOperands(); I != E; ++I) {
+        if (auto *CInt = dyn_cast<ConstantInt>(Struct->getOperand(I)))
+          Anno += (I == 0 ? ": " : ", ") +
+                  std::to_string(CInt->getType()->getIntegerBitWidth() == 1
+                                     ? CInt->getZExtValue()
+                                     : CInt->getSExtValue());
+      }
+    } else if (auto *Struct = dyn_cast<ConstantAggregateZero>(MaybeStruct)) {
+      // { i32 i32 ... } zeroinitializer
+      for (unsigned I = 0, E = Struct->getType()->getStructNumElements();
+           I != E; ++I)
+        Anno += I == 0 ? ": 0" : ", 0";
+    }
+  }
+  return Anno;
+}
+
+static SmallVector<Metadata *> parseAnnotation(Value *I,
+                                               const std::string &Anno,
+                                               LLVMContext &Ctx,
+                                               Type *Int32Ty) {
+  // Try to parse the annotation string according to the following rules:
+  // annotation := ({kind} | {kind:value,value,...})+
+  // kind := number
+  // value := number | string
+  static const std::regex R(
+      "\\{(\\d+)(?:[:,](\\d+|\"[^\"]*\")(?:,(\\d+|\"[^\"]*\"))*)?\\}");
+  SmallVector<Metadata *> MDs;
+  int Pos = 0;
+  for (std::sregex_iterator
+           It = std::sregex_iterator(Anno.begin(), Anno.end(), R),
+           ItEnd = std::sregex_iterator();
+       It != ItEnd; ++It) {
+    if (It->position() != Pos)
+      return SmallVector<Metadata *>{};
+    Pos = It->position() + It->length();
+    std::smatch Match = *It;
+    SmallVector<Metadata *> MDsItem;
+    for (std::size_t i = 1; i < Match.size(); ++i) {
+      std::ssub_match SMatch = Match[i];
+      std::string Item = SMatch.str();
+      if (Item.length() == 0)
+        break;
+      if (Item[0] == '"') {
+        Item = Item.substr(1, Item.length() - 2);
+        // Acceptable format of the string snippet is:
+        static const std::regex RStr("^(\\d+)(?:,(\\d+))*$");
+        if (std::smatch MatchStr; std::regex_match(Item, MatchStr, RStr)) {
+          for (std::size_t SubIdx = 1; SubIdx < MatchStr.size(); ++SubIdx)
+            if (std::string SubStr = MatchStr[SubIdx].str(); SubStr.length())
+              MDsItem.push_back(ConstantAsMetadata::get(
+                  ConstantInt::get(Int32Ty, std::stoi(SubStr))));
+        } else {
+          MDsItem.push_back(MDString::get(Ctx, Item));
+        }
+      } else if (int32_t Num;
+                 std::from_chars(Item.data(), Item.data() + Item.size(), Num)
+                     .ec == std::errc{}) {
+        MDsItem.push_back(
+            ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Num)));
+      } else {
+        MDsItem.push_back(MDString::get(Ctx, Item));
+      }
+    }
+    if (MDsItem.size() == 0)
+      return SmallVector<Metadata *>{};
+    MDs.push_back(MDNode::get(Ctx, MDsItem));
+  }
+  return Pos == static_cast<int>(Anno.length()) ? MDs
+                                                : SmallVector<Metadata *>{};
+}
+
+static void lowerPtrAnnotation(IntrinsicInst *II) {
+  LLVMContext &Ctx = II->getContext();
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+
+  // Retrieve an annotation string from arguments.
+  Value *PtrArg = nullptr;
+  if (auto *BI = dyn_cast<BitCastInst>(II->getArgOperand(0)))
+    PtrArg = BI->getOperand(0);
+  else
+    PtrArg = II->getOperand(0);
+  std::string Anno =
+      getAnnotation(II->getArgOperand(1),
+                    4 < II->arg_size() ? II->getArgOperand(4) : nullptr);
+
+  // Parse the annotation.
+  SmallVector<Metadata *> MDs = parseAnnotation(II, Anno, Ctx, Int32Ty);
+
+  // If the annotation string is not parsed successfully we don't know the
+  // format used and output it as a general UserSemantic decoration.
+  // Otherwise MDs is a Metadata tuple (a decoration list) in the format
+  // expected by `spirv.Decorations`.
+  if (MDs.size() == 0) {
+    auto UserSemantic = ConstantAsMetadata::get(ConstantInt::get(
+        Int32Ty, static_cast<uint32_t>(SPIRV::Decoration::UserSemantic)));
+    MDs.push_back(MDNode::get(Ctx, {UserSemantic, MDString::get(Ctx, Anno)}));
+  }
+
+  // Build the internal intrinsic function.
+  IRBuilder<> IRB(II->getParent());
+  IRB.SetInsertPoint(II);
+  IRB.CreateIntrinsic(
+      Intrinsic::spv_assign_decoration, {PtrArg->getType()},
+      {PtrArg, MetadataAsValue::get(Ctx, MDNode::get(Ctx, MDs))});
+  II->replaceAllUsesWith(II->getOperand(0));
+}
+
 static void lowerFunnelShifts(IntrinsicInst *FSHIntrinsic) {
   // Get a separate function - otherwise, we'd have to rework the CFG of the
   // current one. Then simply replace the intrinsic uses with a call to the new
@@ -334,6 +463,10 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
         Changed |= toSpvOverloadedIntrinsic(
             II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1});
         break;
+      case Intrinsic::ptr_annotation:
+        lowerPtrAnnotation(II);
+        Changed = true;
+        break;
       }
     }
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 98cbd9d2c1f2e..65b48c8acf6ab 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -299,6 +299,9 @@ defm SPV_INTEL_function_pointers : ExtensionOperand<104>;
 defm SPV_INTEL_variable_length_array : ExtensionOperand<105>;
 defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106>;
 defm SPV_INTEL_inline_assembly : ExtensionOperand<107>;
+defm SPV_INTEL_cache_controls : ExtensionOperand<108>;
+defm SPV_INTEL_global_variable_host_access : ExtensionOperand<109>;
+defm SPV_INTEL_global_variable_fpga_decorations : ExtensionOperand<110>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define Capabilities enum values and at the same time
@@ -471,6 +474,10 @@ defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variabl
 defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>;
 defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>;
 defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>;
+defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>;
+defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>;
+defm GlobalVariableFPGADecorationsINTEL : CapabilityOperand<6189, 0, 0, [SPV_INTEL_global_variable_fpga_decorations], []>;
+defm CacheControlsINTEL : CapabilityOperand<6441, 0, 0, [SPV_INTEL_cache_controls], []>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define SourceLanguage enum values and at the same time
@@ -1206,6 +1213,11 @@ defm ReferencedIndirectlyINTEL : DecorationOperand<5602, 0, 0, [], [IndirectRefe
 defm ClobberINTEL : DecorationOperand<5607, 0, 0, [SPV_INTEL_inline_assembly], [AsmINTEL]>;
 defm SideEffectsINTEL : DecorationOperand<5608, 0, 0, [SPV_INTEL_inline_assembly], [AsmINTEL]>;
 defm ArgumentAttributeINTEL : DecorationOperand<6409, 0, 0, [], [FunctionPointersINTEL]>;
+defm CacheControlLoadINTEL : DecorationOperand<6442, 0, 0, [], [CacheControlsINTEL]>;
+defm CacheControlStoreINTEL : DecorationOperand<6443, 0, 0, [], [CacheControlsINTEL]>;
+defm HostAccessINTEL : DecorationOperand<6188, 0, 0, [], [GlobalVariableHostAccessINTEL]>;
+defm InitModeINTEL : DecorationOperand<6190, 0, 0, [], [GlobalVariableFPGADecorationsINTEL]>;
+defm ImplementInRegisterMapINTEL : DecorationOperand<6191, 0, 0, [], [GlobalVariableFPGADecorationsINTEL]>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define BuiltIn enum values and at the same time
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index e8f58a19d25e3..71dfe1062956e 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -54,6 +54,13 @@ cl::opt<bool>
 // setjmp/longjmp handling using wasm EH instrutions
 cl::opt<bool> WebAssembly::WasmEnableSjLj(
     "wasm-enable-sjlj", cl::desc("WebAssembly setjmp/longjmp handling"));
+// Whether we use the new exnref Wasm EH proposal adopted in Oct 2023.
+// Should be used with -wasm-enable-eh.
+// Currently set to false by default, but will later change to true and can
+// then be removed once the legacy Wasm EH instructions are removed.
+cl::opt<bool> WebAssembly::WasmEnableExnref(
+    "wasm-enable-exnref", cl::desc("WebAssembly exception handling (exnref)"),
+    cl::init(false));
 
 static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
                                   const Triple &TT,
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 34502170a5c71..7f1a5f616ed48 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -44,6 +44,7 @@ extern cl::opt<bool> WasmEnableEmEH;   // asm.js-style EH
 extern cl::opt<bool> WasmEnableEmSjLj; // asm.js-style SjLJ
 extern cl::opt<bool> WasmEnableEH;     // EH using Wasm EH instructions
 extern cl::opt<bool> WasmEnableSjLj;   // SjLj using Wasm EH instructions
+extern cl::opt<bool> WasmEnableExnref; // EH using new Wasm EH (exnref)
 
 enum OperandType {
   /// Basic block label in a branch construct.
@@ -355,6 +356,8 @@ inline bool isArgument(unsigned Opc) {
   case WebAssembly::ARGUMENT_funcref_S:
   case WebAssembly::ARGUMENT_externref:
   case WebAssembly::ARGUMENT_externref_S:
+  case WebAssembly::ARGUMENT_exnref:
+  case WebAssembly::ARGUMENT_exnref_S:
     return true;
   default:
     return false;
@@ -377,6 +380,8 @@ inline bool isCopy(unsigned Opc) {
   case WebAssembly::COPY_FUNCREF_S:
   case WebAssembly::COPY_EXTERNREF:
   case WebAssembly::COPY_EXTERNREF_S:
+  case WebAssembly::COPY_EXNREF:
+  case WebAssembly::COPY_EXNREF_S:
     return true;
   default:
     return false;
@@ -399,6 +404,8 @@ inline bool isTee(unsigned Opc) {
   case WebAssembly::TEE_FUNCREF_S:
   case WebAssembly::TEE_EXTERNREF:
   case WebAssembly::TEE_EXTERNREF_S:
+  case WebAssembly::TEE_EXNREF:
+  case WebAssembly::TEE_EXNREF_S:
     return true;
   default:
     return false;
@@ -489,6 +496,8 @@ inline bool isLocalGet(unsigned Opc) {
   case WebAssembly::LOCAL_GET_FUNCREF_S:
   case WebAssembly::LOCAL_GET_EXTERNREF:
   case WebAssembly::LOCAL_GET_EXTERNREF_S:
+  case WebAssembly::LOCAL_GET_EXNREF:
+  case WebAssembly::LOCAL_GET_EXNREF_S:
     return true;
   default:
     return false;
@@ -511,6 +520,8 @@ inline bool isLocalSet(unsigned Opc) {
   case WebAssembly::LOCAL_SET_FUNCREF_S:
   case WebAssembly::LOCAL_SET_EXTERNREF:
   case WebAssembly::LOCAL_SET_EXTERNREF_S:
+  case WebAssembly::LOCAL_SET_EXNREF:
+  case WebAssembly::LOCAL_SET_EXNREF_S:
     return true;
   default:
     return false;
@@ -533,6 +544,8 @@ inline bool isLocalTee(unsigned Opc) {
   case WebAssembly::LOCAL_TEE_FUNCREF_S:
   case WebAssembly::LOCAL_TEE_EXTERNREF:
   case WebAssembly::LOCAL_TEE_EXTERNREF_S:
+  case WebAssembly::LOCAL_TEE_EXNREF:
+  case WebAssembly::LOCAL_TEE_EXNREF_S:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
index 8ea02bd2ad1ff..d9c8e22bbbaf5 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
@@ -27,6 +27,7 @@ std::optional<wasm::ValType> WebAssembly::parseType(StringRef Type) {
              wasm::ValType::V128)
       .Case("funcref", wasm::ValType::FUNCREF)
       .Case("externref", wasm::ValType::EXTERNREF)
+      .Case("exnref", wasm::ValType::EXNREF)
       .Default(std::nullopt);
 }
 
@@ -40,6 +41,7 @@ WebAssembly::BlockType WebAssembly::parseBlockType(StringRef Type) {
       .Case("v128", WebAssembly::BlockType::V128)
       .Case("funcref", WebAssembly::BlockType::Funcref)
       .Case("externref", WebAssembly::BlockType::Externref)
+      .Case("exnref", WebAssembly::BlockType::Exnref)
       .Case("void", WebAssembly::BlockType::Void)
       .Default(WebAssembly::BlockType::Invalid);
 }
@@ -62,6 +64,8 @@ const char *WebAssembly::anyTypeToString(unsigned Type) {
     return "funcref";
   case wasm::WASM_TYPE_EXTERNREF:
     return "externref";
+  case wasm::WASM_TYPE_EXNREF:
+    return "exnref";
   case wasm::WASM_TYPE_FUNC:
     return "func";
   case wasm::WASM_TYPE_NORESULT:
@@ -110,6 +114,8 @@ wasm::ValType WebAssembly::regClassToValType(unsigned RC) {
     return wasm::ValType::FUNCREF;
   case WebAssembly::EXTERNREFRegClassID:
     return wasm::ValType::EXTERNREF;
+  case WebAssembly::EXNREFRegClassID:
+    return wasm::ValType::EXNREF;
   default:
     llvm_unreachable("unexpected type");
   }
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h
index 486cf264d13e2..063ee4dba9068 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h
@@ -32,6 +32,7 @@ enum class BlockType : unsigned {
   V128 = unsigned(wasm::ValType::V128),
   Externref = unsigned(wasm::ValType::EXTERNREF),
   Funcref = unsigned(wasm::ValType::FUNCREF),
+  Exnref = unsigned(wasm::ValType::EXNREF),
   // Multivalue blocks (and other non-void blocks) are only emitted when the
   // blocks will never be exited and are at the ends of functions (see
   // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made
@@ -41,7 +42,8 @@ enum class BlockType : unsigned {
 };
 
 inline bool isRefType(wasm::ValType Type) {
-  return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF;
+  return Type == wasm::ValType::EXTERNREF || Type == wasm::ValType::FUNCREF ||
+         Type == wasm::ValType::EXNREF;
 }
 
 // Convert ValType or a list/signature of ValTypes to a string.
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
index 867953b4e8d71..f9293460e701a 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
@@ -33,6 +33,7 @@ MVT WebAssembly::parseMVT(StringRef Type) {
       .Case("v2i64", MVT::v2i64)
       .Case("funcref", MVT::funcref)
       .Case("externref", MVT::externref)
+      .Case("exnref", MVT::exnref)
       .Default(MVT::INVALID_SIMPLE_VALUE_TYPE);
 }
 
@@ -58,6 +59,8 @@ wasm::ValType WebAssembly::toValType(MVT Type) {
     return wasm::ValType::FUNCREF;
   case MVT::externref:
     return wasm::ValType::EXTERNREF;
+  case MVT::exnref:
+    return wasm::ValType::EXNREF;
   default:
     llvm_unreachable("unexpected type");
   }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 443558537da24..0b7ec6e74cab2 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -125,6 +125,8 @@ static char getInvokeSig(wasm::ValType VT) {
     return 'F';
   case wasm::ValType::EXTERNREF:
     return 'X';
+  case wasm::ValType::EXNREF:
+    return 'E';
   default:
     llvm_unreachable("Unhandled wasm::ValType enum");
   }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 0159c44a79b76..3c6a29311a10e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -100,6 +100,8 @@ static unsigned getDropOpcode(const TargetRegisterClass *RC) {
     return WebAssembly::DROP_FUNCREF;
   if (RC == &WebAssembly::EXTERNREFRegClass)
     return WebAssembly::DROP_EXTERNREF;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return WebAssembly::DROP_EXNREF;
   llvm_unreachable("Unexpected register class");
 }
 
@@ -119,6 +121,8 @@ static unsigned getLocalGetOpcode(const TargetRegisterClass *RC) {
     return WebAssembly::LOCAL_GET_FUNCREF;
   if (RC == &WebAssembly::EXTERNREFRegClass)
     return WebAssembly::LOCAL_GET_EXTERNREF;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return WebAssembly::LOCAL_GET_EXNREF;
   llvm_unreachable("Unexpected register class");
 }
 
@@ -138,6 +142,8 @@ static unsigned getLocalSetOpcode(const TargetRegisterClass *RC) {
     return WebAssembly::LOCAL_SET_FUNCREF;
   if (RC == &WebAssembly::EXTERNREFRegClass)
     return WebAssembly::LOCAL_SET_EXTERNREF;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return WebAssembly::LOCAL_SET_EXNREF;
   llvm_unreachable("Unexpected register class");
 }
 
@@ -157,6 +163,8 @@ static unsigned getLocalTeeOpcode(const TargetRegisterClass *RC) {
     return WebAssembly::LOCAL_TEE_FUNCREF;
   if (RC == &WebAssembly::EXTERNREFRegClass)
     return WebAssembly::LOCAL_TEE_EXTERNREF;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return WebAssembly::LOCAL_TEE_EXNREF;
   llvm_unreachable("Unexpected register class");
 }
 
@@ -176,6 +184,8 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) {
     return MVT::funcref;
   if (RC == &WebAssembly::EXTERNREFRegClass)
     return MVT::externref;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return MVT::exnref;
   llvm_unreachable("unrecognized register class");
 }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 26e13948bc9a6..aa3aa1b007a53 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -137,6 +137,10 @@ class WebAssemblyFastISel final : public FastISel {
       if (Subtarget->hasReferenceTypes())
         return VT;
       break;
+    case MVT::exnref:
+      if (Subtarget->hasReferenceTypes() && Subtarget->hasExceptionHandling())
+        return VT;
+      break;
     case MVT::f16:
       return MVT::f32;
     case MVT::v16i8:
@@ -717,6 +721,10 @@ bool WebAssemblyFastISel::fastLowerArguments() {
       Opc = WebAssembly::ARGUMENT_externref;
       RC = &WebAssembly::EXTERNREFRegClass;
       break;
+    case MVT::exnref:
+      Opc = WebAssembly::ARGUMENT_exnref;
+      RC = &WebAssembly::EXNREFRegClass;
+      break;
     default:
       return false;
     }
@@ -821,6 +829,9 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
     case MVT::externref:
       ResultReg = createResultReg(&WebAssembly::EXTERNREFRegClass);
       break;
+    case MVT::exnref:
+      ResultReg = createResultReg(&WebAssembly::EXNREFRegClass);
+      break;
     default:
       return false;
     }
@@ -948,6 +959,10 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
     Opc = WebAssembly::SELECT_EXTERNREF;
     RC = &WebAssembly::EXTERNREFRegClass;
     break;
+  case MVT::exnref:
+    Opc = WebAssembly::SELECT_EXNREF;
+    RC = &WebAssembly::EXNREFRegClass;
+    break;
   default:
     return false;
   }
@@ -1355,6 +1370,7 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) {
   case MVT::v2f64:
   case MVT::funcref:
   case MVT::externref:
+  case MVT::exnref:
     break;
   default:
     return false;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 518b6932a0c87..4beab9d091581 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -76,6 +76,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
   if (Subtarget->hasReferenceTypes()) {
     addRegisterClass(MVT::externref, &WebAssembly::EXTERNREFRegClass);
     addRegisterClass(MVT::funcref, &WebAssembly::FUNCREFRegClass);
+    if (Subtarget->hasExceptionHandling()) {
+      addRegisterClass(MVT::exnref, &WebAssembly::EXNREFRegClass);
+    }
   }
   // Compute derived properties from the register classes.
   computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -142,6 +145,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setTruncStoreAction(T, MVT::f16, Expand);
   }
 
+  if (Subtarget->hasHalfPrecision()) {
+    setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
+  }
+
   // Expand unavailable integer operations.
   for (auto Op :
        {ISD::BSWAP, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::MULHS, ISD::MULHU,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index c1a5a45395e87..3d37eb2fa27bc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -292,6 +292,7 @@ defm "": ARGUMENT<F32, f32>;
 defm "": ARGUMENT<F64, f64>;
 defm "": ARGUMENT<FUNCREF, funcref>;
 defm "": ARGUMENT<EXTERNREF, externref>;
+defm "": ARGUMENT<EXNREF, exnref>;
 
 // local.get and local.set are not generated by instruction selection; they
 // are implied by virtual register uses and defs.
@@ -375,6 +376,8 @@ defm "" : LOCAL<F64, global_op32>;
 defm "" : LOCAL<V128, global_op32>, Requires<[HasSIMD128]>;
 defm "" : LOCAL<FUNCREF, global_op32>, Requires<[HasReferenceTypes]>;
 defm "" : LOCAL<EXTERNREF, global_op32>, Requires<[HasReferenceTypes]>;
+defm "" : LOCAL<EXNREF, global_op32>,
+          Requires<[HasReferenceTypes, HasExceptionHandling]>;
 
 let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
 defm CONST_I32 : I<(outs I32:$res), (ins i32imm_op:$imm),
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
index 608963d588635..2654a09387fd4 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -17,8 +17,9 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> {
                         [(set rc:$dst, (!cast<Intrinsic>("int_wasm_ref_null_" # ht)))],
                         "ref.null_" # ht # "$dst",
                         "ref.null_" # ht,
-                        !cond(!eq(ht, "func")   : 0xd070, 
-                              !eq(ht, "extern") : 0xd06f)>,
+                        !cond(!eq(ht, "func")   : 0xd070,
+                              !eq(ht, "extern") : 0xd06f,
+                              !eq(ht, "exn")    : 0xd069)>,
                       Requires<[HasReferenceTypes]>;
   defm SELECT_#rc: I<(outs rc:$dst), (ins rc:$lhs, rc:$rhs, I32:$cond),
                      (outs), (ins),
@@ -37,8 +38,9 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> {
 
 defm "" : REF_I<FUNCREF, funcref, "func">;
 defm "" : REF_I<EXTERNREF, externref, "extern">;
+defm "" : REF_I<EXNREF, exnref, "exn">;
 
-foreach rc = [FUNCREF, EXTERNREF] in {
+foreach rc = [FUNCREF, EXTERNREF, EXNREF] in {
 def : Pat<(select (i32 (setne I32:$cond, 0)), rc:$lhs, rc:$rhs),
           (!cast<Instruction>("SELECT_"#rc) rc:$lhs, rc:$rhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), rc:$lhs, rc:$rhs),
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 558e3d859dcd8..baf15ccdbe9ed 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -16,33 +16,34 @@
 multiclass ABSTRACT_SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
                            list<dag> pattern_r, string asmstr_r,
                            string asmstr_s, bits<32> simdop,
-                           Predicate simd_level> {
+                           list<Predicate> reqs> {
   defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
               !if(!ge(simdop, 0x100),
                   !or(0xfd0000, !and(0xffff, simdop)),
                   !or(0xfd00, !and(0xff, simdop)))>,
-            Requires<[simd_level]>;
+            Requires<reqs>;
 }
 
 multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
                   list<dag> pattern_r, string asmstr_r = "",
-                  string asmstr_s = "", bits<32> simdop = -1> {
+                  string asmstr_s = "", bits<32> simdop = -1,
+                  list<Predicate> reqs = []> {
   defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
-                            asmstr_s, simdop, HasSIMD128>;
+                            asmstr_s, simdop, !listconcat([HasSIMD128], reqs)>;
 }
 
 multiclass RELAXED_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
                      list<dag> pattern_r, string asmstr_r = "",
                      string asmstr_s = "", bits<32> simdop = -1> {
   defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
-                            asmstr_s, simdop, HasRelaxedSIMD>;
+                            asmstr_s, simdop, [HasRelaxedSIMD]>;
 }
 
 multiclass HALF_PRECISION_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
                             list<dag> pattern_r, string asmstr_r = "",
                             string asmstr_s = "", bits<32> simdop = -1> {
   defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
-                            asmstr_s, simdop, HasHalfPrecision>;
+                            asmstr_s, simdop, [HasHalfPrecision]>;
 }
 
 
@@ -152,6 +153,19 @@ def F64x2 : Vec {
   let prefix = "f64x2";
 }
 
+def F16x8 : Vec {
+ let vt = v8f16;
+ let int_vt = v8i16;
+ let lane_vt = f32;
+ let lane_rc = F32;
+ let lane_bits = 16;
+ let lane_idx = LaneIdx8;
+ let lane_load = int_wasm_loadf16_f32;
+ let splat = PatFrag<(ops node:$x), (v8f16 (splat_vector (f16 $x)))>;
+ let prefix = "f16x8";
+}
+
+// TODO: Include F16x8 here when half precision is better supported.
 defvar AllVecs = [I8x16, I16x8, I32x4, I64x2, F32x4, F64x2];
 defvar IntVecs = [I8x16, I16x8, I32x4, I64x2];
 
@@ -781,13 +795,19 @@ def : Pat<(v2i64 (nodes[0] (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
 // Bitwise operations
 //===----------------------------------------------------------------------===//
 
-multiclass SIMDBinary<Vec vec, SDPatternOperator node, string name, bits<32> simdop> {
+multiclass SIMDBinary<Vec vec, SDPatternOperator node, string name,
+                      bits<32> simdop, list<Predicate> reqs = []> {
   defm _#vec : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
                       (outs), (ins),
                       [(set (vec.vt V128:$dst),
                         (node (vec.vt V128:$lhs), (vec.vt V128:$rhs)))],
                       vec.prefix#"."#name#"\t$dst, $lhs, $rhs",
-                      vec.prefix#"."#name, simdop>;
+                      vec.prefix#"."#name, simdop, reqs>;
+}
+
+multiclass HalfPrecisionBinary<Vec vec, SDPatternOperator node, string name,
+                               bits<32> simdop> { // Adds HasHalfPrecision.
+  defm "" : SIMDBinary<vec, node, name, simdop, [HasHalfPrecision]>;
 }
 
 multiclass SIMDBitwise<SDPatternOperator node, string name, bits<32> simdop,
@@ -1199,6 +1219,7 @@ def : Pat<(v2f64 (froundeven (v2f64 V128:$src))), (NEAREST_F64x2 V128:$src)>;
 multiclass SIMDBinaryFP<SDPatternOperator node, string name, bits<32> baseInst> {
   defm "" : SIMDBinary<F32x4, node, name, baseInst>;
   defm "" : SIMDBinary<F64x2, node, name, !add(baseInst, 12)>;
+  defm "" : HalfPrecisionBinary<F16x8, node, name, !add(baseInst, 80)>;
 }
 
 // Addition: add
@@ -1242,7 +1263,7 @@ defm PMAX : SIMDBinaryFP<pmax, "pmax", 235>;
 // Also match the pmin/pmax cases where the operands are int vectors (but the
 // comparison is still a floating point comparison). This can happen when using
 // the wasm_simd128.h intrinsics because v128_t is an integer vector.
-foreach vec = [F32x4, F64x2] in {
+foreach vec = [F32x4, F64x2, F16x8] in {
 defvar pmin = !cast<NI>("PMIN_"#vec);
 defvar pmax = !cast<NI>("PMAX_"#vec);
 def : Pat<(vec.int_vt (vselect
@@ -1266,6 +1287,10 @@ def : Pat<(v2f64 (int_wasm_pmin (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
           (PMIN_F64x2 V128:$lhs, V128:$rhs)>;
 def : Pat<(v2f64 (int_wasm_pmax (v2f64 V128:$lhs), (v2f64 V128:$rhs))),
           (PMAX_F64x2 V128:$lhs, V128:$rhs)>;
+def : Pat<(v8f16 (int_wasm_pmin (v8f16 V128:$lhs), (v8f16 V128:$rhs))),
+          (PMIN_F16x8 V128:$lhs, V128:$rhs)>;
+def : Pat<(v8f16 (int_wasm_pmax (v8f16 V128:$lhs), (v8f16 V128:$rhs))),
+          (PMAX_F16x8 V128:$lhs, V128:$rhs)>;
 
 //===----------------------------------------------------------------------===//
 // Conversions
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
index 069ce5e3bc94a..02f0ab8577c3d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td
@@ -64,6 +64,8 @@ multiclass TABLE<WebAssemblyRegClass rc, string suffix> {
 
 defm "" : TABLE<FUNCREF, "funcref">, Requires<[HasReferenceTypes]>;
 defm "" : TABLE<EXTERNREF, "externref">, Requires<[HasReferenceTypes]>;
+defm "" : TABLE<EXNREF, "exnref">,
+          Requires<[HasReferenceTypes, HasExceptionHandling]>;
 
 def : Pat<(WebAssemblyTableSet mcsym:$table, i32:$idx, funcref:$r),
           (TABLE_SET_FUNCREF mcsym:$table, i32:$idx, funcref:$r)>,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index ef174e1716ef1..d4edb6bf18d93 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -504,6 +504,8 @@ static unsigned getTeeOpcode(const TargetRegisterClass *RC) {
     return WebAssembly::TEE_EXTERNREF;
   if (RC == &WebAssembly::FUNCREFRegClass)
     return WebAssembly::TEE_FUNCREF;
+  if (RC == &WebAssembly::EXNREFRegClass)
+    return WebAssembly::TEE_EXNREF;
   llvm_unreachable("Unexpected register class");
 }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 4e2faa608be07..17889dacc868c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -45,6 +45,7 @@ def V128_0: WebAssemblyReg<"%v128">;
 
 def FUNCREF_0 : WebAssemblyReg<"%funcref.0">;
 def EXTERNREF_0 : WebAssemblyReg<"%externref.0">;
+def EXNREF_0 : WebAssemblyReg<"%exnref.0">;
 
 // The value stack "register". This is an opaque entity which serves to order
 // uses and defs that must remain in LIFO order.
@@ -68,3 +69,4 @@ def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8,
                                128, (add V128_0)>;
 def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>;
 def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>;
+def EXNREF : WebAssemblyRegClass<[exnref], 0, (add EXNREF_0)>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index de342e8965736..fd92a35c2638a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -385,18 +385,36 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
 using WebAssembly::WasmEnableEH;
 using WebAssembly::WasmEnableEmEH;
 using WebAssembly::WasmEnableEmSjLj;
+using WebAssembly::WasmEnableExnref;
 using WebAssembly::WasmEnableSjLj;
 
 static void basicCheckForEHAndSjLj(TargetMachine *TM) {
-  // Before checking, we make sure TargetOptions.ExceptionModel is the same as
+
+  // You can't enable two modes of EH at the same time
+  if (WasmEnableEmEH && WasmEnableEH)
+    report_fatal_error(
+        "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-eh");
+  // You can't enable two modes of SjLj at the same time
+  if (WasmEnableEmSjLj && WasmEnableSjLj)
+    report_fatal_error(
+        "-enable-emscripten-sjlj not allowed with -wasm-enable-sjlj");
+  // You can't mix Emscripten EH with Wasm SjLj.
+  if (WasmEnableEmEH && WasmEnableSjLj)
+    report_fatal_error(
+        "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj");
+  if (WasmEnableExnref && !WasmEnableEH)
+    report_fatal_error(
+        "-wasm-enable-exnref should be used with -wasm-enable-eh");
+
+  // Here we make sure TargetOptions.ExceptionModel is the same as
   // MCAsmInfo.ExceptionsType. Normally these have to be the same, because clang
   // stores the exception model info in LangOptions, which is later transferred
   // to TargetOptions and MCAsmInfo. But when clang compiles bitcode directly,
   // clang's LangOptions is not used and thus the exception model info is not
   // correctly transferred to TargetOptions and MCAsmInfo, so we make sure we
-  // have the correct exception model in WebAssemblyMCAsmInfo constructor.
-  // But in this case TargetOptions is still not updated, so we make sure they
-  // are the same.
+  // have the correct exception model in WebAssemblyMCAsmInfo constructor. But
+  // in this case TargetOptions is still not updated, so we make sure they are
+  // the same.
   TM->Options.ExceptionModel = TM->getMCAsmInfo()->getExceptionHandlingType();
 
   // Basic Correctness checking related to -exception-model
@@ -418,18 +436,6 @@ static void basicCheckForEHAndSjLj(TargetMachine *TM) {
         "-exception-model=wasm only allowed with at least one of "
         "-wasm-enable-eh or -wasm-enable-sjlj");
 
-  // You can't enable two modes of EH at the same time
-  if (WasmEnableEmEH && WasmEnableEH)
-    report_fatal_error(
-        "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-eh");
-  // You can't enable two modes of SjLj at the same time
-  if (WasmEnableEmSjLj && WasmEnableSjLj)
-    report_fatal_error(
-        "-enable-emscripten-sjlj not allowed with -wasm-enable-sjlj");
-  // You can't mix Emscripten EH with Wasm SjLj.
-  if (WasmEnableEmEH && WasmEnableSjLj)
-    report_fatal_error(
-        "-enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj");
   // Currently it is allowed to mix Wasm EH with Emscripten SjLj as an interim
   // measure, but some code will error out at compile time in this combination.
   // See WebAssemblyLowerEmscriptenEHSjLj pass for details.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index 60e872549f87d..5e7279808cce6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -175,6 +175,8 @@ unsigned WebAssembly::getCopyOpcodeForRegClass(const TargetRegisterClass *RC) {
     return WebAssembly::COPY_FUNCREF;
   case WebAssembly::EXTERNREFRegClassID:
     return WebAssembly::COPY_EXTERNREF;
+  case WebAssembly::EXNREFRegClassID:
+    return WebAssembly::COPY_EXNREF;
   default:
     llvm_unreachable("Unexpected register class");
   }
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 6442cc2193308..11b2155e3f985 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -14,6 +14,7 @@
 //   b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX)
 //   c. NDD (EVEX) -> non-NDD (legacy)
 //   d. NF_ND (EVEX) -> NF (EVEX)
+//   e. NonNF (EVEX) -> NF (EVEX)
 //
 // Compression a, b and c can always reduce code size, with some exceptions
 // such as promoted 16-bit CRC32 which is as long as the legacy version.
@@ -30,6 +31,9 @@
 //
 // Compression d can help hardware decode (HW may skip reading the NDD
 // register) although the instruction length remains unchanged.
+//
+// Compression e can help hardware skip updating EFLAGS although the instruction
+// length remains unchanged.
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/X86BaseInfo.h"
@@ -177,7 +181,8 @@ static bool isRedundantNewDataDest(MachineInstr &MI, const X86Subtarget &ST) {
   const MCInstrDesc &Desc = MI.getDesc();
   Register Reg0 = MI.getOperand(0).getReg();
   const MachineOperand &Op1 = MI.getOperand(1);
-  if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1)
+  if (!Op1.isReg() || X86::getFirstAddrOperandIdx(MI) == 1 ||
+      X86::isCFCMOVCC(MI.getOpcode()))
     return false;
   Register Reg1 = Op1.getReg();
   if (Reg1 == Reg0)
@@ -219,25 +224,36 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
     return false;
   // MOVBE*rr is special because it has semantic of NDD but not set EVEX_B.
   bool IsNDLike = IsND || Opc == X86::MOVBE32rr || Opc == X86::MOVBE64rr;
-  if (IsNDLike && !isRedundantNewDataDest(MI, ST))
+  bool IsRedundantNDD = IsNDLike ? isRedundantNewDataDest(MI, ST) : false;
+  // NonNF -> NF only if it's not a compressible NDD instruction and eflags is
+  // dead.
+  unsigned NFOpc = (ST.hasNF() && !IsRedundantNDD &&
+                    MI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr))
+                       ? X86::getNFVariant(Opc)
+                       : 0U;
+  if (IsNDLike && !IsRedundantNDD && !NFOpc)
     return false;
 
-  ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable);
-
-  Opc = MI.getOpcode();
-  const auto *I = llvm::lower_bound(Table, Opc);
-  if (I == Table.end() || I->OldOpc != Opc) {
-    assert(!IsNDLike && "Missing entry for ND-like instruction");
-    return false;
-  }
+  unsigned NewOpc = NFOpc;
+  if (!NewOpc) {
+    ArrayRef<X86TableEntry> Table = ArrayRef(X86CompressEVEXTable);
 
-  if (!IsNDLike) {
-    if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) ||
-        !performCustomAdjustments(MI, I->NewOpc))
+    Opc = MI.getOpcode();
+    const auto I = llvm::lower_bound(Table, Opc);
+    if (I == Table.end() || I->OldOpc != Opc) {
+      assert(!IsNDLike && "Missing entry for ND-like instruction");
       return false;
+    }
+
+    if (!IsNDLike) {
+      if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) ||
+          !performCustomAdjustments(MI, I->NewOpc))
+        return false;
+    }
+    NewOpc = I->NewOpc;
   }
 
-  const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(I->NewOpc);
+  const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(NewOpc);
   MI.setDesc(NewDesc);
   unsigned AsmComment;
   switch (NewDesc.TSFlags & X86II::EncodingMask) {
@@ -256,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
     llvm_unreachable("Unknown EVEX compression");
   }
   MI.setAsmPrinterFlag(AsmComment);
-  if (IsNDLike)
+  if (IsRedundantNDD)
     MI.tieOperands(0, 1);
 
   return true;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 7d05f950b6fe9..3e391da807889 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3221,6 +3221,14 @@ int X86::getCCMPCondFlagsFromCondCode(X86::CondCode CC) {
   }
 }
 
+#define GET_X86_NF_TRANSFORM_TABLE
+#include "X86GenInstrMapping.inc"
+unsigned X86::getNFVariant(unsigned Opc) { // Returns 0 if Opc has no entry.
+  ArrayRef<X86TableEntry> Table = ArrayRef(X86NFTransformTable);
+  const auto I = llvm::lower_bound(Table, Opc); // Presumably sorted by OldOpc.
+  return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
+}
+
 /// Return the inverse of the specified condition,
 /// e.g. turning COND_E to COND_NE.
 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 295fac60c6e40..9eb2bd56b2ab5 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -77,6 +77,9 @@ CondCode getCondFromCCMP(const MachineInstr &MI);
 // Turn condition code into condition flags for CCMP/CTEST.
 int getCCMPCondFlagsFromCondCode(CondCode CC);
 
+// Get the opcode of the corresponding NF variant, or 0 if there is none.
+unsigned getNFVariant(unsigned Opc);
+
 /// GetOppositeBranchCondition - Return the inverse of the specified cond,
 /// e.g. turning COND_E to COND_NE.
 CondCode GetOppositeBranchCondition(CondCode CC);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d935be7669f05..3b18e39d784b2 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6257,7 +6257,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
                                 AddressSpace, CostKind);
 
   unsigned VF = VecTy->getNumElements() / Factor;
-  MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
+  MVT VT =
+      MVT::getVectorVT(TLI->getSimpleValueType(DL, VecTy->getScalarType()), VF);
 
   InstructionCost MaskCost;
   if (UseMaskedMemOp) {
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index c10b4be4eded9..ca356ec82bf1f 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -181,12 +181,6 @@ void AArch64::ExtensionSet::enable(ArchExtKind E) {
         !BaseArch->is_superset(ARMV9A))
       enable(AEK_FP16FML);
 
-    // For all architectures, +crypto enables +aes and +sha2.
-    if (E == AEK_CRYPTO) {
-      enable(AEK_AES);
-      enable(AEK_SHA2);
-    }
-
     // For v8.4A+ and v9.0A+, +crypto also enables +sha3 and +sm4.
     if (E == AEK_CRYPTO && BaseArch->is_superset(ARMV8_4A)) {
       enable(AEK_SHA3);
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 38b8dab984db3..8e829a53aeca2 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -2756,12 +2756,11 @@ static void sinkSpillUsesAfterCoroBegin(Function &F,
 /// after the suspend block. Doing so minimizes the lifetime of each variable,
 /// hence minimizing the amount of data we end up putting on the frame.
 static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
-                                     SuspendCrossingInfo &Checker) {
+                                     SuspendCrossingInfo &Checker,
+                                     const DominatorTree &DT) {
   if (F.hasOptNone())
     return;
 
-  DominatorTree DT(F);
-
   // Collect all possible basic blocks which may dominate all uses of allocas.
   SmallPtrSet<BasicBlock *, 4> DomSet;
   DomSet.insert(&F.getEntryBlock());
@@ -3149,12 +3148,13 @@ void coro::buildCoroutineFrame(
 
   doRematerializations(F, Checker, MaterializableCallback);
 
+  const DominatorTree DT(F);
   FrameDataInfo FrameData;
   SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas;
   SmallVector<Instruction*, 4> DeadInstructions;
   if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
       Shape.ABI != coro::ABI::RetconOnce)
-    sinkLifetimeStartMarkers(F, Shape, Checker);
+    sinkLifetimeStartMarkers(F, Shape, Checker, DT);
 
   // Collect the spills for arguments and other not-materializable values.
   for (Argument &A : F.args())
@@ -3162,7 +3162,6 @@ void coro::buildCoroutineFrame(
       if (Checker.isDefinitionAcrossSuspend(A, U))
         FrameData.Spills[&A].push_back(cast<Instruction>(U));
 
-  const DominatorTree DT(F);
   for (Instruction &I : instructions(F)) {
     // Values returned from coroutine structure intrinsics should not be part
     // of the Coroutine Frame.
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 633fcb3314c42..f86f217bca588 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -879,7 +879,7 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
     // Multiply by 2 to account for padding elements.
     Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
                                       ConstantInt::get(Int32Ty, I * 2)};
-    Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr(
+    Constant *CombinedGlobalElemPtr = ConstantExpr::getInBoundsGetElementPtr(
         NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs);
     assert(GV->getType()->getAddressSpace() == 0);
     GlobalAlias *GAlias =
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 0920179fb76b7..92ad4c34da6e7 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -1391,10 +1391,11 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
       return InlineCost::getAlways("preinliner");
   }
 
-  // For old FDO inliner, we inline the call site as long as cost is not
-  // "Never". The cost-benefit check is done earlier.
+  // For old FDO inliner, we inline the call site if its cost is below the
+  // hot call-site threshold, even if the function is hot based on sample
+  // profile data. This is to prevent huge functions from being inlined.
   if (!CallsitePrioritizedInline) {
-    return InlineCost::get(Cost.getCost(), INT_MAX);
+    return InlineCost::get(Cost.getCost(), SampleHotCallSiteThreshold);
   }
 
   // Otherwise only use the cost from call analyzer, but overwite threshold with
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index e7a188e9431db..9929ebb96dcaf 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -1927,7 +1927,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) {
   // element (the original initializer).
   auto Alias = GlobalAlias::create(
       B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "",
-      ConstantExpr::getGetElementPtr(
+      ConstantExpr::getInBoundsGetElementPtr(
           NewInit->getType(), NewGV,
           ArrayRef<Constant *>{ConstantInt::get(Int32Ty, 0),
                                ConstantInt::get(Int32Ty, 1)}),
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c3272d97509f5..89193f8ff94b6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4730,6 +4730,21 @@ static Instruction *foldICmpAndXX(ICmpInst &I, const SimplifyQuery &Q,
   if (Pred == ICmpInst::ICMP_UGE)
     return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
 
+  if (ICmpInst::isEquality(Pred) && Op0->hasOneUse()) {
+    // icmp (X & Y) eq/ne Y --> (X | ~Y) eq/ne -1 if Y is freely invertible and
+    // Y is non-constant. If Y is constant the `X & C == C` form is preferable
+    // so don't do this fold.
+    if (!match(Op1, m_ImmConstant()))
+      if (auto *NotOp1 =
+              IC.getFreelyInverted(Op1, !Op1->hasNUsesOrMore(3), &IC.Builder))
+        return new ICmpInst(Pred, IC.Builder.CreateOr(A, NotOp1),
+                            Constant::getAllOnesValue(Op1->getType()));
+    // icmp (X & Y) eq/ne Y --> (~X & Y) eq/ne 0 if X is freely invertible.
+    if (auto *NotA = IC.getFreelyInverted(A, A->hasOneUse(), &IC.Builder))
+      return new ICmpInst(Pred, IC.Builder.CreateAnd(Op1, NotA),
+                          Constant::getNullValue(Op1->getType()));
+  }
+
   return nullptr;
 }
 
@@ -5505,21 +5520,6 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
     }
   }
 
-  // canoncalize:
-  // (icmp eq/ne (and X, C), X)
-  //    -> (icmp eq/ne (and X, ~C), 0)
-  {
-    Constant *CMask;
-    A = nullptr;
-    if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_ImmConstant(CMask)))))
-      A = Op1;
-    else if (match(Op1, m_OneUse(m_And(m_Specific(Op0), m_ImmConstant(CMask)))))
-      A = Op0;
-    if (A)
-      return new ICmpInst(Pred, Builder.CreateAnd(A, Builder.CreateNot(CMask)),
-                          Constant::getNullValue(A->getType()));
-  }
-
   if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
     // A == (A^B)  ->  B == 0
     Value *OtherVal = A == Op0 ? B : A;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 0f1979fbe0c76..4f91993750fd2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1461,13 +1461,24 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
 
     const APInt *MulC;
     if (match(Op0, m_NUWMul(m_Value(X), m_APInt(MulC)))) {
-      // Look for a "splat" mul pattern - it replicates bits across each half of
-      // a value, so a right shift is just a mask of the low bits:
-      // lshr i[2N] (mul nuw X, (2^N)+1), N --> and iN X, (2^N)-1
-      // TODO: Generalize to allow more than just half-width shifts?
-      if (BitWidth > 2 && ShAmtC * 2 == BitWidth && (*MulC - 1).isPowerOf2() &&
-          MulC->logBase2() == ShAmtC)
-        return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2));
+      if (BitWidth > 2 && (*MulC - 1).isPowerOf2() &&
+          MulC->logBase2() == ShAmtC) {
+        // Look for a "splat" mul pattern - it replicates bits across each half
+        // of a value, so a right shift is just a mask of the low bits:
+        // lshr i[2N] (mul nuw X, (2^N)+1), N --> and iN X, (2^N)-1
+        if (ShAmtC * 2 == BitWidth)
+          return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2));
+
+        // lshr (mul nuw (X, 2^N + 1)), N -> add nuw (X, lshr(X, N))
+        if (Op0->hasOneUse()) {
+          auto *NewAdd = BinaryOperator::CreateNUWAdd(
+              X, Builder.CreateLShr(X, ConstantInt::get(Ty, ShAmtC), "",
+                                    I.isExact()));
+          NewAdd->setHasNoSignedWrap(
+              cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap());
+          return NewAdd;
+        }
+      }
 
       // The one-use check is not strictly necessary, but codegen may not be
       // able to invert the transform and perf may suffer with an extra mul
@@ -1487,6 +1498,16 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
       }
     }
 
+    // lshr (mul nsw (X, 2^N + 1)), N -> add nsw (X, lshr(X, N))
+    if (match(Op0, m_OneUse(m_NSWMul(m_Value(X), m_APInt(MulC))))) {
+      if (BitWidth > 2 && (*MulC - 1).isPowerOf2() &&
+          MulC->logBase2() == ShAmtC) {
+        return BinaryOperator::CreateNSWAdd(
+            X, Builder.CreateLShr(X, ConstantInt::get(Ty, ShAmtC), "",
+                                  I.isExact()));
+      }
+    }
+
     // Try to narrow bswap.
     // In the case where the shift amount equals the bitwidth difference, the
     // shift is eliminated.
@@ -1690,6 +1711,21 @@ Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) {
       if (match(Op0, m_OneUse(m_NSWSub(m_Value(X), m_Value(Y)))))
         return new SExtInst(Builder.CreateICmpSLT(X, Y), Ty);
     }
+
+    const APInt *MulC;
+    if (match(Op0, m_OneUse(m_NSWMul(m_Value(X), m_APInt(MulC)))) &&
+        (BitWidth > 2 && (*MulC - 1).isPowerOf2() &&
+         MulC->logBase2() == ShAmt &&
+         (ShAmt < BitWidth - 1))) /* Minus 1 for the sign bit */ {
+
+      // ashr (mul nsw (X, 2^N + 1)), N -> add nsw (X, ashr(X, N))
+      auto *NewAdd = BinaryOperator::CreateNSWAdd(
+          X,
+          Builder.CreateAShr(X, ConstantInt::get(Ty, ShAmt), "", I.isExact()));
+      NewAdd->setHasNoUnsignedWrap(
+          cast<OverflowingBinaryOperator>(Op0)->hasNoUnsignedWrap());
+      return NewAdd;
+    }
   }
 
   const SimplifyQuery Q = SQ.getWithInstruction(&I);
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 20d11e0ab55f2..f0b0917a25938 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1804,8 +1804,8 @@ Value *DFSanFunction::getRetvalTLS(Type *T, IRBuilder<> &IRB) {
 Value *DFSanFunction::getRetvalOriginTLS() { return DFS.RetvalOriginTLS; }
 
 Value *DFSanFunction::getArgOriginTLS(unsigned ArgNo, IRBuilder<> &IRB) {
-  return IRB.CreateConstGEP2_64(DFS.ArgOriginTLSTy, DFS.ArgOriginTLS, 0, ArgNo,
-                                "_dfsarg_o");
+  return IRB.CreateConstInBoundsGEP2_64(DFS.ArgOriginTLSTy, DFS.ArgOriginTLS, 0,
+                                        ArgNo, "_dfsarg_o");
 }
 
 Value *DFSanFunction::getOrigin(Value *V) {
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 6aa4188d1cc4d..5eccf7b4adb65 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2751,7 +2751,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
   IRBuilder<> Builder(Preheader->getTerminator());
   for (auto *U : Changes) {
     assert(L.isLoopInvariant(U->get()));
-    Instruction *Ins = cast<Instruction>(U->getUser());
+    auto *Ins = cast<BinaryOperator>(U->getUser());
     Value *Mul;
     if (I.getType()->isIntOrIntVectorTy()) {
       Mul = Builder.CreateMul(U->get(), Factor, "factor.op.mul");
@@ -2759,8 +2759,20 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
       Ins->dropPoisonGeneratingFlags();
     } else
       Mul = Builder.CreateFMulFMF(U->get(), Factor, Ins, "factor.op.fmul");
-    U->set(Mul);
+
+    // Rewrite the reassociable instruction.
+    unsigned OpIdx = U->getOperandNo();
+    auto *LHS = OpIdx == 0 ? Mul : Ins->getOperand(0);
+    auto *RHS = OpIdx == 1 ? Mul : Ins->getOperand(1);
+    auto *NewBO = BinaryOperator::Create(Ins->getOpcode(), LHS, RHS,
+                                         Ins->getName() + ".reass", Ins);
+    NewBO->copyIRFlags(Ins);
+    if (VariantOp == Ins)
+      VariantOp = NewBO;
+    Ins->replaceAllUsesWith(NewBO);
+    eraseInstruction(*Ins, SafetyInfo, MSSAU);
   }
+
   I.replaceAllUsesWith(VariantOp);
   eraseInstruction(I, SafetyInfo, MSSAU);
   return true;
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index c7e25c9f3d2c9..3fe5478408d45 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -22,8 +22,6 @@
 //
 // Future loop memory idioms to recognize:
 //   memcmp, strlen, etc.
-// Future floating point idioms to recognize in -ffast-math mode:
-//   fpowi
 //
 // This could recognize common matrix multiplies and dot product idioms and
 // replace them with calls to BLAS (if linked in??).
@@ -1107,7 +1105,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
     GV->setAlignment(Align(16));
     Value *PatternPtr = GV;
     NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
-    
+
     // Set the TBAA info if present.
     if (AATags.TBAA)
       NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA);
@@ -1117,7 +1115,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
 
     if (AATags.NoAlias)
       NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias);
-  } 
+  }
 
   NewCall->setDebugLoc(TheStore->getDebugLoc());
 
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index d91320863e241..04c54ed69e93f 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -302,97 +302,6 @@ static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
   return Res;
 }
 
-/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
-/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
-/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
-/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
-/// even x in Bitwidth-bit arithmetic.
-static unsigned CarmichaelShift(unsigned Bitwidth) {
-  if (Bitwidth < 3)
-    return Bitwidth - 1;
-  return Bitwidth - 2;
-}
-
-/// Add the extra weight 'RHS' to the existing weight 'LHS',
-/// reducing the combined weight using any special properties of the operation.
-/// The existing weight LHS represents the computation X op X op ... op X where
-/// X occurs LHS times.  The combined weight represents  X op X op ... op X with
-/// X occurring LHS + RHS times.  If op is "Xor" for example then the combined
-/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
-/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
-static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
-  // If we were working with infinite precision arithmetic then the combined
-  // weight would be LHS + RHS.  But we are using finite precision arithmetic,
-  // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
-  // for nilpotent operations and addition, but not for idempotent operations
-  // and multiplication), so it is important to correctly reduce the combined
-  // weight back into range if wrapping would be wrong.
-
-  // If RHS is zero then the weight didn't change.
-  if (RHS.isMinValue())
-    return;
-  // If LHS is zero then the combined weight is RHS.
-  if (LHS.isMinValue()) {
-    LHS = RHS;
-    return;
-  }
-  // From this point on we know that neither LHS nor RHS is zero.
-
-  if (Instruction::isIdempotent(Opcode)) {
-    // Idempotent means X op X === X, so any non-zero weight is equivalent to a
-    // weight of 1.  Keeping weights at zero or one also means that wrapping is
-    // not a problem.
-    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
-    return; // Return a weight of 1.
-  }
-  if (Instruction::isNilpotent(Opcode)) {
-    // Nilpotent means X op X === 0, so reduce weights modulo 2.
-    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
-    LHS = 0; // 1 + 1 === 0 modulo 2.
-    return;
-  }
-  if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
-    // TODO: Reduce the weight by exploiting nsw/nuw?
-    LHS += RHS;
-    return;
-  }
-
-  assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
-         "Unknown associative operation!");
-  unsigned Bitwidth = LHS.getBitWidth();
-  // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
-  // can be replaced with W-CM.  That's because x^W=x^(W-CM) for every Bitwidth
-  // bit number x, since either x is odd in which case x^CM = 1, or x is even in
-  // which case both x^W and x^(W - CM) are zero.  By subtracting off multiples
-  // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
-  // which by a happy accident means that they can always be represented using
-  // Bitwidth bits.
-  // TODO: Reduce the weight by exploiting nsw/nuw?  (Could do much better than
-  // the Carmichael number).
-  if (Bitwidth > 3) {
-    /// CM - The value of Carmichael's lambda function.
-    APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
-    // Any weight W >= Threshold can be replaced with W - CM.
-    APInt Threshold = CM + Bitwidth;
-    assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
-    // For Bitwidth 4 or more the following sum does not overflow.
-    LHS += RHS;
-    while (LHS.uge(Threshold))
-      LHS -= CM;
-  } else {
-    // To avoid problems with overflow do everything the same as above but using
-    // a larger type.
-    unsigned CM = 1U << CarmichaelShift(Bitwidth);
-    unsigned Threshold = CM + Bitwidth;
-    assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
-           "Weights not reduced!");
-    unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
-    while (Total >= Threshold)
-      Total -= CM;
-    LHS = Total;
-  }
-}
-
 using RepeatedValue = std::pair<Value*, APInt>;
 
 /// Given an associative binary expression, return the leaf
@@ -471,7 +380,7 @@ using RepeatedValue = std::pair<Value*, APInt>;
 static bool LinearizeExprTree(Instruction *I,
                               SmallVectorImpl<RepeatedValue> &Ops,
                               ReassociatePass::OrderedSet &ToRedo,
-                              bool &HasNUW) {
+                              reassociate::OverflowTracking &Flags) {
   assert((isa<UnaryOperator>(I) || isa<BinaryOperator>(I)) &&
          "Expected a UnaryOperator or BinaryOperator!");
   LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
@@ -512,6 +421,7 @@ static bool LinearizeExprTree(Instruction *I,
   using LeafMap = DenseMap<Value *, APInt>;
   LeafMap Leaves; // Leaf -> Total weight so far.
   SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order.
+  const DataLayout DL = I->getModule()->getDataLayout();
 
 #ifndef NDEBUG
   SmallPtrSet<Value *, 8> Visited; // For checking the iteration scheme.
@@ -520,8 +430,10 @@ static bool LinearizeExprTree(Instruction *I,
     std::pair<Instruction*, APInt> P = Worklist.pop_back_val();
     I = P.first; // We examine the operands of this binary operator.
 
-    if (isa<OverflowingBinaryOperator>(I))
-      HasNUW &= I->hasNoUnsignedWrap();
+    if (isa<OverflowingBinaryOperator>(I)) {
+      Flags.HasNUW &= I->hasNoUnsignedWrap();
+      Flags.HasNSW &= I->hasNoSignedWrap();
+    }
 
     for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx) { // Visit operands.
       Value *Op = I->getOperand(OpIdx);
@@ -559,26 +471,7 @@ static bool LinearizeExprTree(Instruction *I,
                "In leaf map but not visited!");
 
         // Update the number of paths to the leaf.
-        IncorporateWeight(It->second, Weight, Opcode);
-
-#if 0   // TODO: Re-enable once PR13021 is fixed.
-        // The leaf already has one use from inside the expression.  As we want
-        // exactly one such use, drop this new use of the leaf.
-        assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
-        I->setOperand(OpIdx, UndefValue::get(I->getType()));
-        Changed = true;
-
-        // If the leaf is a binary operation of the right kind and we now see
-        // that its multiple original uses were in fact all by nodes belonging
-        // to the expression, then no longer consider it to be a leaf and add
-        // its operands to the expression.
-        if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
-          LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
-          Worklist.push_back(std::make_pair(BO, It->second));
-          Leaves.erase(It);
-          continue;
-        }
-#endif
+        It->second += Weight;
 
         // If we still have uses that are not accounted for by the expression
         // then it is not safe to modify the value.
@@ -648,6 +541,8 @@ static bool LinearizeExprTree(Instruction *I,
     // Ensure the leaf is only output once.
     It->second = 0;
     Ops.push_back(std::make_pair(V, Weight));
+    if (Opcode == Instruction::Add && Flags.AllKnownNonNegative && Flags.HasNSW)
+      Flags.AllKnownNonNegative &= isKnownNonNegative(V, SimplifyQuery(DL));
   }
 
   // For nilpotent operations or addition there may be no operands, for example
@@ -666,7 +561,7 @@ static bool LinearizeExprTree(Instruction *I,
 /// linearized and optimized, emit them in-order.
 void ReassociatePass::RewriteExprTree(BinaryOperator *I,
                                       SmallVectorImpl<ValueEntry> &Ops,
-                                      bool HasNUW) {
+                                      OverflowTracking Flags) {
   assert(Ops.size() > 1 && "Single values should be used directly!");
 
   // Since our optimizations should never increase the number of operations, the
@@ -834,8 +729,12 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I,
           // Note that it doesn't hold for mul if one of the operands is zero.
           // TODO: We can preserve NUW flag if we prove that all mul operands
           // are non-zero.
-          if (HasNUW && ExpressionChangedStart->getOpcode() == Instruction::Add)
-            ExpressionChangedStart->setHasNoUnsignedWrap();
+          if (ExpressionChangedStart->getOpcode() == Instruction::Add) {
+            if (Flags.HasNUW)
+              ExpressionChangedStart->setHasNoUnsignedWrap();
+            if (Flags.HasNSW && (Flags.AllKnownNonNegative || Flags.HasNUW))
+              ExpressionChangedStart->setHasNoSignedWrap();
+          }
         }
       }
 
@@ -1192,8 +1091,8 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
     return nullptr;
 
   SmallVector<RepeatedValue, 8> Tree;
-  bool HasNUW = true;
-  MadeChange |= LinearizeExprTree(BO, Tree, RedoInsts, HasNUW);
+  OverflowTracking Flags;
+  MadeChange |= LinearizeExprTree(BO, Tree, RedoInsts, Flags);
   SmallVector<ValueEntry, 8> Factors;
   Factors.reserve(Tree.size());
   for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
@@ -1235,7 +1134,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
 
   if (!FoundFactor) {
     // Make sure to restore the operands to the expression tree.
-    RewriteExprTree(BO, Factors, HasNUW);
+    RewriteExprTree(BO, Factors, Flags);
     return nullptr;
   }
 
@@ -1247,7 +1146,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
     RedoInsts.insert(BO);
     V = Factors[0].Op;
   } else {
-    RewriteExprTree(BO, Factors, HasNUW);
+    RewriteExprTree(BO, Factors, Flags);
     V = BO;
   }
 
@@ -2373,8 +2272,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
   // First, walk the expression tree, linearizing the tree, collecting the
   // operand information.
   SmallVector<RepeatedValue, 8> Tree;
-  bool HasNUW = true;
-  MadeChange |= LinearizeExprTree(I, Tree, RedoInsts, HasNUW);
+  OverflowTracking Flags;
+  MadeChange |= LinearizeExprTree(I, Tree, RedoInsts, Flags);
   SmallVector<ValueEntry, 8> Ops;
   Ops.reserve(Tree.size());
   for (const RepeatedValue &E : Tree)
@@ -2567,7 +2466,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
              dbgs() << '\n');
   // Now that we ordered and optimized the expressions, splat them back into
   // the expression tree, removing any unneeded nodes.
-  RewriteExprTree(I, Ops, HasNUW);
+  RewriteExprTree(I, Ops, Flags);
 }
 
 void
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 7b846f2d2d72d..eb471b259c7d4 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -30,11 +30,12 @@
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/AttributeMask.h"
 #include "llvm/IR/Argument.h"
+#include "llvm/IR/AttributeMask.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
@@ -1450,6 +1451,8 @@ static AttrBuilder IdentifyValidPoisonGeneratingAttributes(CallBase &CB) {
     Valid.addAttribute(Attribute::NonNull);
   if (CB.hasRetAttr(Attribute::Alignment))
     Valid.addAlignmentAttr(CB.getRetAlign());
+  if (std::optional<ConstantRange> Range = CB.getRange())
+    Valid.addRangeAttr(*Range);
   return Valid;
 }
 
@@ -1541,6 +1544,14 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
     if (ValidPG.getAlignment().valueOrOne() < AL.getRetAlignment().valueOrOne())
       ValidPG.removeAttribute(Attribute::Alignment);
     if (ValidPG.hasAttributes()) {
+      Attribute CBRange = ValidPG.getAttribute(Attribute::Range);
+      if (CBRange.isValid()) {
+        Attribute NewRange = AL.getRetAttr(Attribute::Range);
+        if (NewRange.isValid()) {
+          ValidPG.addRangeAttr(
+              CBRange.getRange().intersectWith(NewRange.getRange()));
+        }
+      }
       // Three checks.
       // If the callsite has `noundef`, then a poison due to violating the
       // return attribute will create UB anyways so we can always propagate.
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 056f0d6b3ee6c..7ecfe5218ef67 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1746,6 +1746,9 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
     return Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
   if (auto *SI = dyn_cast<SelectInst>(I))
     return Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
+  if (auto *CI = dyn_cast<CastInst>(I))
+    return Builder.CreateCast((Instruction::CastOps)CI->getOpcode(), Ops[0],
+                              DstTy);
   if (II)
     return Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
   assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
@@ -1757,8 +1760,7 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
 // do so.
 bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
   auto *Ty = dyn_cast<FixedVectorType>(I.getType());
-  if (!Ty || !isa<Instruction>(I.getOperand(0)) ||
-      !isa<Instruction>(I.getOperand(1)))
+  if (!Ty)
     return false;
 
   SmallVector<InstLane> Start(Ty->getNumElements());
@@ -1847,7 +1849,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
         isa<CmpInst>(FrontV)) {
       Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
       Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
-    } else if (isa<UnaryOperator>(FrontV)) {
+    } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontV)) {
       Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
     } else if (isa<SelectInst>(FrontV)) {
       Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
diff --git a/llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll b/llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll
new file mode 100644
index 0000000000000..d0d414a869636
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/handle-iptr-with-data-layout-to-not-assert.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9] for VF [0-9] For instruction:\s*store ptr %[0-9], ptr %__last" --filter "LV: Found an estimated cost of [0-9] for VF [0-9] For instruction:\s*store ptr %[0-9]" --version 5
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @foo(ptr %__first, ptr %__last) #0 {
+; CHECK-LABEL: 'foo'
+; CHECK:  LV: Found an estimated cost of 1 for VF 1 For instruction: store ptr %0, ptr %__last, align 8
+; CHECK:  LV: Found an estimated cost of 2 for VF 2 For instruction: store ptr %0, ptr %__last, align 8
+; CHECK:  LV: Found an estimated cost of 3 for VF 4 For instruction: store ptr %0, ptr %__last, align 8
+; CHECK:  LV: Found an estimated cost of 3 for VF 8 For instruction: store ptr %0, ptr %__last, align 8
+;
+entry:
+  %cmp.not1 = icmp eq ptr %__first, %__last
+  br i1 %cmp.not1, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %__first.addr.02 = phi ptr [ %incdec.ptr, %for.body ], [ %__first, %for.body.preheader ]
+  %0 = load ptr, ptr %__first.addr.02, align 8
+  store ptr %0, ptr %__last, align 8
+  %incdec.ptr = getelementptr inbounds i8, ptr %__first.addr.02, i64 16
+  %cmp.not = icmp eq ptr %incdec.ptr, %__last
+  br i1 %cmp.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret ptr null
+}
+
+attributes #0 = { "target-cpu"="znver4" }
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index 809b15b200495..81d8b01fe7fb7 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -130,8 +130,16 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
 ; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence'
 ; CHECK-NEXT:    loop:
 ; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:  Unknown data dependence.
 ; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            %ld.f64 = load double, ptr %gep.iv, align 8 ->
+; CHECK-NEXT:            store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            %ld.i64 = load i64, ptr %gep.iv, align 8 ->
+; CHECK-NEXT:            store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
+; CHECK-EMPTY:
 ; CHECK-NEXT:        BackwardVectorizableButPreventsForwarding:
 ; CHECK-NEXT:            %ld.f64 = load double, ptr %gep.iv, align 8 ->
 ; CHECK-NEXT:            store double %val, ptr %gep.iv.101.i64, align 8
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll
index 845ff078ee0eb..416742a94e0d3 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/non-constant-strides-backward.ll
@@ -45,8 +45,13 @@ exit:
 define void @different_non_constant_strides_known_backward_distance_larger_than_trip_count(ptr %A) {
 ; CHECK-LABEL: 'different_non_constant_strides_known_backward_distance_larger_than_trip_count'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unknown data dependence.
 ; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            %l = load i32, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i32 %add, ptr %gep.mul.2, align 4
+; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll
new file mode 100644
index 0000000000000..8dc79a54eb97a
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/predicated-symbolic-max-backedge-taken-count.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='print<scalar-evolution>' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 | FileCheck %s
+
+; %i and %i + 1 can overflow.
+define void @test1(i64 %x, ptr %a, ptr %b) {
+; CHECK-LABEL: 'test1'
+; CHECK-NEXT:  Determining loop execution counts for: @test1
+; CHECK-NEXT:  Loop %header: <multiple exits> Unpredictable backedge-taken count.
+; CHECK-NEXT:    exit count for header: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:    exit count for latch: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:  Loop %header: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %header: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT:    symbolic max exit count for header: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:    symbolic max exit count for latch: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:  Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x))
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<%header> Added Flags: <nusw>
+;
+entry:
+  br label %header
+
+header:
+  %conv11 = phi i64 [ 0, %entry ], [ %conv, %latch ]
+  %i.010 = phi i32 [ 0, %entry ], [ %add, %latch ]
+  %add = add i32 %i.010, 1
+  %idxprom = zext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
+  %ld = load i32, ptr %arrayidx, align 4
+  %uncountable.c = icmp eq i32 %ld, 10
+  br i1 %uncountable.c, label %exit, label %latch
+
+latch:
+  %add2 = add nsw i32 %ld, 1
+  %arrayidx4 = getelementptr inbounds i32, ptr %b, i64 %conv11
+  store i32 %add2, ptr %arrayidx4, align 4
+  %conv = zext i32 %add to i64
+  %cmp = icmp ult i64 %conv, %x
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; %i can overflow.
+;
+; We need to check that i doesn't wrap, but we don't need a run-time alias
+; check. We also need an extra no-wrap check to get the backedge taken count.
+define void @test2(i64 %x, ptr %a) {
+; CHECK-LABEL: 'test2'
+; CHECK-NEXT:  Determining loop execution counts for: @test2
+; CHECK-NEXT:  Loop %header: <multiple exits> Unpredictable backedge-taken count.
+; CHECK-NEXT:    exit count for header: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:    exit count for latch: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:  Loop %header: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %header: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT:    symbolic max exit count for header: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:    symbolic max exit count for latch: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:  Loop %header: Predicated symbolic max backedge-taken count is (-1 + (1 umax %x))
+; CHECK-NEXT:   Predicates:
+; CHECK-NEXT:      {1,+,1}<%header> Added Flags: <nusw>
+;
+entry:
+  br label %header
+
+header:
+  %conv11 = phi i64 [ 0, %entry ], [ %conv, %latch ]
+  %i.010 = phi i32  [ 0, %entry ], [ %inc, %latch ]
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %conv11
+  %ld = load i32, ptr %arrayidx, align 4
+  %uncountable.c = icmp eq i32 %ld, 10
+  br i1 %uncountable.c, label %exit, label %latch
+
+latch:
+  %add = add nsw i32 %ld, 1
+  store i32 %add, ptr %arrayidx, align 4
+  %inc = add i32 %i.010, 1
+  %conv = zext i32 %inc to i64
+  %cmp = icmp ult i64 %conv, %x
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
index 335026dc9b62b..efad77b684a75 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/irreducible/diverged-entry-headers.ll
@@ -90,7 +90,7 @@ S:
   br i1 %cond.uni, label %exit, label %T
 
 T:
-; CHECK-NIT:   DIVERGENT:   %tt.phi = phi i32
+; CHECK-NOT:   DIVERGENT:   %tt.phi = phi i32
   %tt.phi = phi i32 [ %ss, %S ], [ %a, %entry ]
   %tt = add i32 %b, 1
   br label %P
diff --git a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll
index 47c4587f6991b..ba3a484441e9e 100644
--- a/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll
+++ b/llvm/test/Analysis/ValueTracking/known-power-of-two-urem.ll
@@ -428,9 +428,9 @@ define i8 @known_power_of_two_lshr_add_one_allow_zero(i8 %x, i8 %y) {
 define i1 @known_power_of_two_lshr_add_one_nuw_deny_zero(i8 %x, i8 %y) {
 ; CHECK-LABEL: @known_power_of_two_lshr_add_one_nuw_deny_zero(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
-; CHECK-NEXT:    [[P:%.*]] = add nuw i8 [[TMP1]], 1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[P]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 -2, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP3]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %4 = lshr i8 -1, %x
@@ -445,9 +445,9 @@ define i1 @known_power_of_two_lshr_add_one_nuw_deny_zero(i8 %x, i8 %y) {
 define i1 @negative_known_power_of_two_lshr_add_one_deny_zero(i8 %x, i8 %y) {
 ; CHECK-LABEL: @negative_known_power_of_two_lshr_add_one_deny_zero(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
-; CHECK-NEXT:    [[P:%.*]] = add i8 [[TMP1]], 1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[P]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[P]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 -2, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP3]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %4 = lshr i8 -1, %x
@@ -462,9 +462,9 @@ define i1 @negative_known_power_of_two_lshr_add_one_deny_zero(i8 %x, i8 %y) {
 define i1 @negative_known_power_of_two_lshr_add_one_nsw_deny_zero(i8 %x, i8 %y) {
 ; CHECK-LABEL: @negative_known_power_of_two_lshr_add_one_nsw_deny_zero(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[X:%.*]]
-; CHECK-NEXT:    [[P:%.*]] = add nsw i8 [[TMP1]], 1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[P]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[P]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 -2, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP2]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP3]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %4 = lshr i8 -1, %x
diff --git a/llvm/test/Assembler/invalid-ptrauth-const1.ll b/llvm/test/Assembler/invalid-ptrauth-const1.ll
new file mode 100644
index 0000000000000..fba2e23078238
--- /dev/null
+++ b/llvm/test/Assembler/invalid-ptrauth-const1.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: error: constant ptrauth base pointer must be a pointer
+@auth_var = global ptr ptrauth (i32 42, i32 0)
diff --git a/llvm/test/Assembler/invalid-ptrauth-const2.ll b/llvm/test/Assembler/invalid-ptrauth-const2.ll
new file mode 100644
index 0000000000000..4499c42601c99
--- /dev/null
+++ b/llvm/test/Assembler/invalid-ptrauth-const2.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: error: constant ptrauth key must be i32 constant
+@auth_var = global ptr ptrauth (ptr @var, i32 ptrtoint (ptr @var to i32))
diff --git a/llvm/test/Assembler/invalid-ptrauth-const3.ll b/llvm/test/Assembler/invalid-ptrauth-const3.ll
new file mode 100644
index 0000000000000..3f2688d92a001
--- /dev/null
+++ b/llvm/test/Assembler/invalid-ptrauth-const3.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: error: constant ptrauth address discriminator must be a pointer
+@auth_var = global ptr ptrauth (ptr @var, i32 2, i64 65535, i8 0)
diff --git a/llvm/test/Assembler/invalid-ptrauth-const4.ll b/llvm/test/Assembler/invalid-ptrauth-const4.ll
new file mode 100644
index 0000000000000..843a220458a61
--- /dev/null
+++ b/llvm/test/Assembler/invalid-ptrauth-const4.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: error: constant ptrauth integer discriminator must be i64 constant
+@auth_var = global ptr ptrauth (ptr @var, i32 2, ptr null, i64 ptrtoint (ptr @var to i64))
diff --git a/llvm/test/Assembler/invalid-ptrauth-const5.ll b/llvm/test/Assembler/invalid-ptrauth-const5.ll
new file mode 100644
index 0000000000000..9b47f6f5f423f
--- /dev/null
+++ b/llvm/test/Assembler/invalid-ptrauth-const5.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: error: constant ptrauth integer discriminator must be i64 constant
+@auth_var = global ptr ptrauth (ptr @var, i32 2, ptr @var))
diff --git a/llvm/test/Assembler/non-global-value-max-name-size-2.ll b/llvm/test/Assembler/non-global-value-max-name-size-2.ll
new file mode 100644
index 0000000000000..5eac003ddb438
--- /dev/null
+++ b/llvm/test/Assembler/non-global-value-max-name-size-2.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -S -passes='always-inline' -non-global-value-max-name-size=5 | opt -non-global-value-max-name-size=5 -passes=verify -disable-output
+
+; Opt should not generate overly long names for labels during inlining.
+
+define internal i32 @inner(i32 %flag) alwaysinline {
+entry:
+  %icmp = icmp slt i32 %flag, 0
+  br i1 %icmp, label %one, label %two
+
+one:
+  ret i32 42
+
+two:
+  ret i32 44
+}
+
+define i32 @outer(i32 %x) {
+entry:
+  %call1 = call i32 @inner(i32 %x)
+  %call2 = call i32 @inner(i32 %x)
+  %ret = add i32 %call1, %call2
+  ret i32 %ret
+}
\ No newline at end of file
diff --git a/llvm/test/Assembler/ptrauth-const.ll b/llvm/test/Assembler/ptrauth-const.ll
new file mode 100644
index 0000000000000..94d35146d5927
--- /dev/null
+++ b/llvm/test/Assembler/ptrauth-const.ll
@@ -0,0 +1,24 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+@var = global i32 0
+
+; CHECK: @basic = global ptr ptrauth (ptr @var, i32 0)
+@basic = global ptr ptrauth (ptr @var, i32 0)
+
+; CHECK: @keyed = global ptr ptrauth (ptr @var, i32 3)
+@keyed = global ptr ptrauth (ptr @var, i32 3)
+
+; CHECK: @intdisc = global ptr ptrauth (ptr @var, i32 0, i64 -1)
+@intdisc = global ptr ptrauth (ptr @var, i32 0, i64 -1)
+
+; CHECK: @addrdisc = global ptr ptrauth (ptr @var, i32 2, i64 1234, ptr @addrdisc)
+@addrdisc = global ptr ptrauth (ptr @var, i32 2, i64 1234, ptr @addrdisc)
+
+
+@var1 = addrspace(1) global i32 0
+
+; CHECK: @addrspace = global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 0)
+@addrspace = global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 0)
+
+; CHECK: @addrspace_addrdisc = addrspace(2) global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 2, i64 1234, ptr addrspace(2) @addrspace_addrdisc)
+@addrspace_addrdisc = addrspace(2) global ptr addrspace(1) ptrauth (ptr addrspace(1) @var1, i32 2, i64 1234, ptr addrspace(2) @addrspace_addrdisc)
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index b374924516d66..2a846e036924c 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -217,6 +217,10 @@ declare void @g.f1()
 ; CHECK: @g.sanitize_address_dyninit = global i32 0, sanitize_address_dyninit
 ; CHECK: @g.sanitize_multiple = global i32 0, sanitize_memtag, sanitize_address_dyninit
 
+; ptrauth constant
+@auth_var = global ptr ptrauth (ptr @g1, i32 0, i64 65535, ptr null)
+; CHECK: @auth_var = global ptr ptrauth (ptr @g1, i32 0, i64 65535)
+
 ;; Aliases
 ; Format: @<Name> = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal]
 ;                   [unnamed_addr] alias <AliaseeTy> @<Aliasee>
diff --git a/llvm/test/Bitcode/value-with-long-name-dbg.ll b/llvm/test/Bitcode/value-with-long-name-dbg.ll
new file mode 100644
index 0000000000000..0cc3569d8617b
--- /dev/null
+++ b/llvm/test/Bitcode/value-with-long-name-dbg.ll
@@ -0,0 +1,11 @@
+; REQUIRES: asserts
+; Force the size to be small to check the assertion message.
+; RUN: not --crash opt -S %s -O2 -o - -non-global-value-max-name-size=0 2>&1 | FileCheck %s
+; CHECK: Can't generate unique name: MaxNameSize is too small.
+
+define i32 @f(i32 %a, i32 %b) {
+  %c = add i32 %a, %b
+  %d = add i32 %c, %a
+  %e = add i32 %d, %b
+  ret i32 %e
+}
diff --git a/llvm/test/Bitcode/value-with-long-name.ll b/llvm/test/Bitcode/value-with-long-name.ll
index 1ca5d133e09ae..aa7da5f5b7dba 100644
--- a/llvm/test/Bitcode/value-with-long-name.ll
+++ b/llvm/test/Bitcode/value-with-long-name.ll
@@ -1,10 +1,10 @@
 ; Check the size of generated variable when no option is set
 ; RUN: opt -S %s -O2 -o - | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=-1 | FileCheck -check-prefix=CHECK-LONG %s
 ; CHECK-LONG: %{{[a-z]{4}[a-z]+}}
 
 ; Then check we correctly cap the size of newly generated non-global values name
 ; Force the size to be small so that the check works on release and debug build
-; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=0 | FileCheck -check-prefix=CHECK-SHORT %s
 ; RUN: opt -S %s -O2 -o - -non-global-value-max-name-size=1 | FileCheck -check-prefix=CHECK-SHORT %s
 ; CHECK-SHORT-NOT: %{{[a-z][a-z]+}}
 
@@ -14,5 +14,3 @@ define i32 @f(i32 %a, i32 %b) {
   %e = add i32 %d, %b
   ret i32 %e
 }
-
-
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index c942339e43608..2f466c258f677 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -26,6 +26,7 @@ llvm_canonicalize_cmake_booleans(
   LLVM_TOOL_LLVM_DRIVER_BUILD
   LLVM_INCLUDE_SPIRV_TOOLS_TESTS
   LLVM_APPEND_VC_REV
+  LLVM_HAS_LOGF128
   )
 
 configure_lit_site_cfg(
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll
index e7e231bc344d9..3732d4feb0c67 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll
@@ -566,6 +566,119 @@ define dso_local i128 @load_atomic_i128_unaligned_seq_cst_const(ptr readonly %pt
     %r = load atomic i128, ptr %ptr seq_cst, align 1
     ret i128 %r
 }
+
+define dso_local half @load_atomic_f16_aligned_unordered(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_unordered:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic half, ptr %ptr unordered, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_unordered_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_unordered_const:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic half, ptr %ptr unordered, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_monotonic(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_monotonic:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic half, ptr %ptr monotonic, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_monotonic_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_monotonic_const:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic half, ptr %ptr monotonic, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_acquire(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_acquire:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic half, ptr %ptr acquire, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_acquire_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_acquire_const:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic half, ptr %ptr acquire, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_seq_cst(ptr %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_seq_cst:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic half, ptr %ptr seq_cst, align 2
+    ret half %r
+}
+
+define dso_local half @load_atomic_f16_aligned_seq_cst_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_f16_aligned_seq_cst_const:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic half, ptr %ptr seq_cst, align 2
+    ret half %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_unordered(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_unordered:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr unordered, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_unordered_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_unordered_const:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr unordered, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_monotonic(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_monotonic:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr monotonic, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_monotonic_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_monotonic_const:
+; CHECK:    ldrh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr monotonic, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_acquire(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_acquire:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr acquire, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_acquire_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_acquire_const:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr acquire, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_seq_cst(ptr %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr seq_cst, align 2
+    ret bfloat %r
+}
+
+define dso_local bfloat @load_atomic_bf16_aligned_seq_cst_const(ptr readonly %ptr) {
+; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst_const:
+; CHECK:    ldarh w8, [x0]
+    %r = load atomic bfloat, ptr %ptr seq_cst, align 2
+    ret bfloat %r
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; -O0: {{.*}}
 ; -O1: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir
index fad3655da9d01..a0142afd06777 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add.mir
@@ -207,3 +207,126 @@ body:             |
     %3:_(<4 x s32>) = G_FADD %0, %2(<4 x s32>)
     $q0 = COPY %3(<4 x s32>)
 ...
+---
+name:            saddl_v8i8_v8i32
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: saddl_v8i8_v8i32
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[SEXT]], [[SEXT1]]
+    ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[ADD]](<8 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT2]](<8 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
+    ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+    %0:_(<8 x s8>) = COPY $d0
+    %1:_(<8 x s8>) = COPY $d1
+    %2:_(<8 x s32>) = G_SEXT %0(<8 x s8>)
+    %3:_(<8 x s32>) = G_SEXT %1(<8 x s8>)
+    %4:_(<8 x s32>) = G_ADD %2, %3
+    %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>)
+    $q0 = COPY %5(<4 x s32>)
+    $q1 = COPY %6(<4 x s32>)
+    RET_ReallyLR implicit $q0, implicit $q1
+...
+
+---
+name:            uaddl_v8i8_v8i32
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: uaddl_v8i8_v8i32
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[ZEXT]], [[ZEXT1]]
+    ; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(<8 x s32>) = G_ZEXT [[ADD]](<8 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ZEXT2]](<8 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
+    ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+    %0:_(<8 x s8>) = COPY $d0
+    %1:_(<8 x s8>) = COPY $d1
+    %2:_(<8 x s32>) = G_ZEXT %0(<8 x s8>)
+    %3:_(<8 x s32>) = G_ZEXT %1(<8 x s8>)
+    %4:_(<8 x s32>) = G_ADD %2, %3
+    %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>)
+    $q0 = COPY %5(<4 x s32>)
+    $q1 = COPY %6(<4 x s32>)
+    RET_ReallyLR implicit $q0, implicit $q1
+...
+
+---
+name:            ssubl_v8i8_v8i32
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: ssubl_v8i8_v8i32
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(<8 x s16>) = G_SEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[SEXT]], [[SEXT1]]
+    ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[SUB]](<8 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT2]](<8 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
+    ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+    %0:_(<8 x s8>) = COPY $d0
+    %1:_(<8 x s8>) = COPY $d1
+    %2:_(<8 x s32>) = G_SEXT %0(<8 x s8>)
+    %3:_(<8 x s32>) = G_SEXT %1(<8 x s8>)
+    %4:_(<8 x s32>) = G_SUB %2, %3
+    %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>)
+    $q0 = COPY %5(<4 x s32>)
+    $q1 = COPY %6(<4 x s32>)
+    RET_ReallyLR implicit $q0, implicit $q1
+...
+
+---
+name:            usubl_v8i8_v8i32
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: usubl_v8i8_v8i32
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s8>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s8>) = COPY $d1
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY]](<8 x s8>)
+    ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(<8 x s16>) = G_ZEXT [[COPY1]](<8 x s8>)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[ZEXT]], [[ZEXT1]]
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<8 x s32>) = G_SEXT [[SUB]](<8 x s16>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[SEXT]](<8 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[UV]](<4 x s32>)
+    ; CHECK-NEXT: $q1 = COPY [[UV1]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+    %0:_(<8 x s8>) = COPY $d0
+    %1:_(<8 x s8>) = COPY $d1
+    %2:_(<8 x s32>) = G_ZEXT %0(<8 x s8>)
+    %3:_(<8 x s32>) = G_ZEXT %1(<8 x s8>)
+    %4:_(<8 x s32>) = G_SUB %2, %3
+    %5:_(<4 x s32>), %6:_(<4 x s32>) = G_UNMERGE_VALUES %4(<8 x s32>)
+    $q0 = COPY %5(<4 x s32>)
+    $q1 = COPY %6(<4 x s32>)
+    RET_ReallyLR implicit $q0, implicit $q1
+...
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 94b792b887eb4..def4192b0e005 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -94,18 +94,19 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias
 ;
 ; GISEL-LABEL: oversized_ADDV_256:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    ldr d0, [x0]
-; GISEL-NEXT:    ldr d1, [x1]
-; GISEL-NEXT:    ushll v0.8h, v0.8b, #0
-; GISEL-NEXT:    ushll v1.8h, v1.8b, #0
-; GISEL-NEXT:    usubl v2.4s, v0.4h, v1.4h
-; GISEL-NEXT:    usubl2 v0.4s, v0.8h, v1.8h
-; GISEL-NEXT:    cmlt v1.4s, v2.4s, #0
-; GISEL-NEXT:    cmlt v3.4s, v0.4s, #0
-; GISEL-NEXT:    neg v4.4s, v2.4s
-; GISEL-NEXT:    neg v5.4s, v0.4s
-; GISEL-NEXT:    bsl v1.16b, v4.16b, v2.16b
-; GISEL-NEXT:    bit v0.16b, v5.16b, v3.16b
+; GISEL-NEXT:    ldr d1, [x0]
+; GISEL-NEXT:    ldr d2, [x1]
+; GISEL-NEXT:    movi v0.2d, #0000000000000000
+; GISEL-NEXT:    usubl v1.8h, v1.8b, v2.8b
+; GISEL-NEXT:    sshll v2.4s, v1.4h, #0
+; GISEL-NEXT:    sshll2 v3.4s, v1.8h, #0
+; GISEL-NEXT:    ssubw2 v0.4s, v0.4s, v1.8h
+; GISEL-NEXT:    cmlt v4.4s, v2.4s, #0
+; GISEL-NEXT:    cmlt v5.4s, v3.4s, #0
+; GISEL-NEXT:    neg v6.4s, v2.4s
+; GISEL-NEXT:    mov v1.16b, v4.16b
+; GISEL-NEXT:    bif v0.16b, v3.16b, v5.16b
+; GISEL-NEXT:    bsl v1.16b, v6.16b, v2.16b
 ; GISEL-NEXT:    add v0.4s, v1.4s, v0.4s
 ; GISEL-NEXT:    addv s0, v0.4s
 ; GISEL-NEXT:    fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
index b89232c03f136..44b92e6ccd088 100644
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
 ; CHECK-LABEL: tbl1_8b:
@@ -20,175 +21,378 @@ define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
 }
 
 define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
-; CHECK-LABEL: tbl2_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    tbl.8b v0, { v0, v1 }, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl2_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1 }, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl2_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
-; CHECK-LABEL: tbl2_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    tbl.16b v0, { v0, v1 }, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl2_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl2_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
-; CHECK-LABEL: tbl3_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    tbl.8b v0, { v0, v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl3_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl3_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
-; CHECK-LABEL: tbl3_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl3_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl3_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
-; CHECK-LABEL: tbl4_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.8b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl4_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl4_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
-; CHECK-LABEL: tbl4_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbl4_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbl4_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
   ret <16 x i8> %tmp3
 }
 
-; CHECK-LABEL: .LCPI8_0:
-; CHECK-NEXT:     .byte    0                               // 0x0
-; CHECK-NEXT:     .byte    4                               // 0x4
-; CHECK-NEXT:     .byte    8                               // 0x8
-; CHECK-NEXT:     .byte    12                              // 0xc
-; CHECK-NEXT:     .byte    255                             // 0xff
-; CHECK-NEXT:     .byte    255                             // 0xff
-; CHECK-NEXT:     .byte    255                             // 0xff
-; CHECK-NEXT:     .byte    255                             // 0xff
+; CHECK-SD-LABEL: .LCPI8_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   8                               // 0x8
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+
+; CHECK-GI-LABEL: .LCPI8_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   13                              // 0xd
+; CHECK-GI-NEXT:         .byte   14                              // 0xe
+; CHECK-GI-NEXT:         .byte   15                              // 0xf
+; CHECK-GI-LABEL: .LCPI8_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI8_0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    tbl.8b v0, { v0, v1 }, v4
-; CHECK-NEXT:    tbl.8b v1, { v2, v3 }, v4
-; CHECK-NEXT:    mov.s v0[1], v1[1]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    adrp x8, .LCPI8_0
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    ldr d4, [x8, :lo12:.LCPI8_0]
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    tbl.8b v0, { v0, v1 }, v4
+; CHECK-SD-NEXT:    tbl.8b v1, { v2, v3 }, v4
+; CHECK-SD-NEXT:    mov.s v0[1], v1[1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI8_1
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr d4, [x8, :lo12:.LCPI8_1]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    adrp x8, .LCPI8_0
+; CHECK-GI-NEXT:    tbl.8b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.8b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %c, <16 x i8> %d, <8 x i8> <i8 0, i8 4, i8 8, i8 12, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <8 x i8> %t1, <8 x i8> %t2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i8> %s
 }
 
-; CHECK-LABEL: .LCPI9_0:
-; CHECK-NEXT:     .byte    0                               // 0x0
-; CHECK-NEXT:     .byte    4                               // 0x4
-; CHECK-NEXT:     .byte    8                               // 0x8
-; CHECK-NEXT:     .byte    12                              // 0xc
-; CHECK-NEXT:     .byte    16                              // 0x10
-; CHECK-NEXT:     .byte    20                              // 0x14
-; CHECK-NEXT:     .byte    24                              // 0x18
-; CHECK-NEXT:     .byte    28                              // 0x1c
-; CHECK-NEXT:     .byte   32                              // 0x20
-; CHECK-NEXT:     .byte   36                              // 0x24
-; CHECK-NEXT:     .byte   40                              // 0x28
-; CHECK-NEXT:     .byte   44                              // 0x2c
-; CHECK-NEXT:     .byte   48                              // 0x30
-; CHECK-NEXT:     .byte   52                              // 0x34
-; CHECK-NEXT:     .byte   56                              // 0x38
-; CHECK-NEXT:     .byte   60                              // 0x3c
+; CHECK-SD-LABEL: .LCPI9_0:
+; CHECK-SD-NEXT:     .byte    0                               // 0x0
+; CHECK-SD-NEXT:     .byte    4                               // 0x4
+; CHECK-SD-NEXT:     .byte    8                               // 0x8
+; CHECK-SD-NEXT:     .byte    12                              // 0xc
+; CHECK-SD-NEXT:     .byte    16                              // 0x10
+; CHECK-SD-NEXT:     .byte    20                              // 0x14
+; CHECK-SD-NEXT:     .byte    24                              // 0x18
+; CHECK-SD-NEXT:     .byte    28                              // 0x1c
+; CHECK-SD-NEXT:     .byte   32                              // 0x20
+; CHECK-SD-NEXT:     .byte   36                              // 0x24
+; CHECK-SD-NEXT:     .byte   40                              // 0x28
+; CHECK-SD-NEXT:     .byte   44                              // 0x2c
+; CHECK-SD-NEXT:     .byte   48                              // 0x30
+; CHECK-SD-NEXT:     .byte   52                              // 0x34
+; CHECK-SD-NEXT:     .byte   56                              // 0x38
+; CHECK-SD-NEXT:     .byte   60                              // 0x3c
+
+;CHECK-GI-LABEL: .LCPI9_0:
+;CHECK-GI:              .byte   0                               // 0x0
+;CHECK-GI-NEXT:         .byte   1                               // 0x1
+;CHECK-GI-NEXT:         .byte   2                               // 0x2
+;CHECK-GI-NEXT:         .byte   3                               // 0x3
+;CHECK-GI-NEXT:         .byte   4                               // 0x4
+;CHECK-GI-NEXT:         .byte   5                               // 0x5
+;CHECK-GI-NEXT:         .byte   6                               // 0x6
+;CHECK-GI-NEXT:         .byte   7                               // 0x7
+;CHECK-GI-NEXT:         .byte   16                              // 0x10
+;CHECK-GI-NEXT:         .byte   17                              // 0x11
+;CHECK-GI-NEXT:         .byte   18                              // 0x12
+;CHECK-GI-NEXT:         .byte   19                              // 0x13
+;CHECK-GI-NEXT:         .byte   20                              // 0x14
+;CHECK-GI-NEXT:         .byte   21                              // 0x15
+;CHECK-GI-NEXT:         .byte   22                              // 0x16
+;CHECK-GI-NEXT:         .byte   23                              // 0x17
+;CHECK-GI-LABEL: .LCPI9_1:
+;CHECK-GI:              .byte   0                               // 0x0
+;CHECK-GI-NEXT:         .byte   4                               // 0x4
+;CHECK-GI-NEXT:         .byte   8                               // 0x8
+;CHECK-GI-NEXT:         .byte   12                              // 0xc
+;CHECK-GI-NEXT:         .byte   16                              // 0x10
+;CHECK-GI-NEXT:         .byte   20                              // 0x14
+;CHECK-GI-NEXT:         .byte   24                              // 0x18
+;CHECK-GI-NEXT:         .byte   28                              // 0x1c
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
+;CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI9_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI9_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI9_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI9_1
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI9_1]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    adrp x8, .LCPI9_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI9_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i8> %s
 }
 
+; CHECK-GI-LABEL: .LCPI10_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI10_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s4, w0
-; CHECK-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    mov.b v4[1], w0
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    mov.b v4[2], w0
-; CHECK-NEXT:    mov.b v4[3], w0
-; CHECK-NEXT:    mov.b v4[4], w0
-; CHECK-NEXT:    mov.b v4[5], w0
-; CHECK-NEXT:    mov.b v4[6], w0
-; CHECK-NEXT:    mov.b v4[7], w0
-; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov w8, #36 // =0x24
-; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov w8, #40 // =0x28
-; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov w8, #44 // =0x2c
-; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov w8, #48 // =0x30
-; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov w8, #52 // =0x34
-; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    mov w8, #56 // =0x38
-; CHECK-NEXT:    mov.b v4[14], w8
-; CHECK-NEXT:    mov w8, #60 // =0x3c
-; CHECK-NEXT:    mov.b v4[15], w8
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov s4, w0
+; CHECK-SD-NEXT:    mov w8, #32 // =0x20
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    mov.b v4[1], w0
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    mov.b v4[2], w0
+; CHECK-SD-NEXT:    mov.b v4[3], w0
+; CHECK-SD-NEXT:    mov.b v4[4], w0
+; CHECK-SD-NEXT:    mov.b v4[5], w0
+; CHECK-SD-NEXT:    mov.b v4[6], w0
+; CHECK-SD-NEXT:    mov.b v4[7], w0
+; CHECK-SD-NEXT:    mov.b v4[8], w8
+; CHECK-SD-NEXT:    mov w8, #36 // =0x24
+; CHECK-SD-NEXT:    mov.b v4[9], w8
+; CHECK-SD-NEXT:    mov w8, #40 // =0x28
+; CHECK-SD-NEXT:    mov.b v4[10], w8
+; CHECK-SD-NEXT:    mov w8, #44 // =0x2c
+; CHECK-SD-NEXT:    mov.b v4[11], w8
+; CHECK-SD-NEXT:    mov w8, #48 // =0x30
+; CHECK-SD-NEXT:    mov.b v4[12], w8
+; CHECK-SD-NEXT:    mov w8, #52 // =0x34
+; CHECK-SD-NEXT:    mov.b v4[13], w8
+; CHECK-SD-NEXT:    mov w8, #56 // =0x38
+; CHECK-SD-NEXT:    mov.b v4[14], w8
+; CHECK-SD-NEXT:    mov w8, #60 // =0x3c
+; CHECK-SD-NEXT:    mov.b v4[15], w8
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI10_1
+; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI10_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI10_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v5
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI10_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
@@ -211,40 +415,111 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
   ret <16 x i8> %s
 }
 
+; CHECK-GI-LABEL: .LCPI11_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   15                              // 0xf
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   31                              // 0x1f
+; CHECK-GI-LABEL: .LCPI11_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    fmov s4, w8
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    mov.b v4[1], w8
-; CHECK-NEXT:    mov.b v4[2], w8
-; CHECK-NEXT:    mov.b v4[3], w8
-; CHECK-NEXT:    mov.b v4[4], w8
-; CHECK-NEXT:    mov.b v4[5], w8
-; CHECK-NEXT:    mov.b v4[6], w8
-; CHECK-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEXT:    mov.b v4[7], w0
-; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov w8, #36 // =0x24
-; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov w8, #40 // =0x28
-; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov w8, #44 // =0x2c
-; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov w8, #48 // =0x30
-; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov w8, #52 // =0x34
-; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    mov w8, #56 // =0x38
-; CHECK-NEXT:    mov.b v4[14], w8
-; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    mov.b v4[15], w8
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #1 // =0x1
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    fmov s4, w8
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    mov.b v4[1], w8
+; CHECK-SD-NEXT:    mov.b v4[2], w8
+; CHECK-SD-NEXT:    mov.b v4[3], w8
+; CHECK-SD-NEXT:    mov.b v4[4], w8
+; CHECK-SD-NEXT:    mov.b v4[5], w8
+; CHECK-SD-NEXT:    mov.b v4[6], w8
+; CHECK-SD-NEXT:    mov w8, #32 // =0x20
+; CHECK-SD-NEXT:    mov.b v4[7], w0
+; CHECK-SD-NEXT:    mov.b v4[8], w8
+; CHECK-SD-NEXT:    mov w8, #36 // =0x24
+; CHECK-SD-NEXT:    mov.b v4[9], w8
+; CHECK-SD-NEXT:    mov w8, #40 // =0x28
+; CHECK-SD-NEXT:    mov.b v4[10], w8
+; CHECK-SD-NEXT:    mov w8, #44 // =0x2c
+; CHECK-SD-NEXT:    mov.b v4[11], w8
+; CHECK-SD-NEXT:    mov w8, #48 // =0x30
+; CHECK-SD-NEXT:    mov.b v4[12], w8
+; CHECK-SD-NEXT:    mov w8, #52 // =0x34
+; CHECK-SD-NEXT:    mov.b v4[13], w8
+; CHECK-SD-NEXT:    mov w8, #56 // =0x38
+; CHECK-SD-NEXT:    mov.b v4[14], w8
+; CHECK-SD-NEXT:    mov w8, #31 // =0x1f
+; CHECK-SD-NEXT:    mov.b v4[15], w8
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, #1 // =0x1
+; CHECK-GI-NEXT:    fmov s6, w0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI11_1
+; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI11_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI11_0
+; CHECK-GI-NEXT:    mov.b v5[15], v6[0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v5
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI11_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 1, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 1, i32 2
@@ -267,29 +542,116 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x
   ret <16 x i8> %s
 }
 
+; CHECK-SD-LABEL: .LCPI12_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   8                               // 0x8
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+
+; CHECK-GI-LABEL: .LCPI12_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI12_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.2d v4, #0xffffffffffffffff
-; CHECK-NEXT:    adrp x8, .LCPI12_0
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    tbl.16b v2, { v2, v3 }, v5
-; CHECK-NEXT:    mov.b v4[0], w0
-; CHECK-NEXT:    mov.b v4[1], w0
-; CHECK-NEXT:    mov.b v4[2], w0
-; CHECK-NEXT:    mov.b v4[3], w0
-; CHECK-NEXT:    mov.b v4[4], w0
-; CHECK-NEXT:    mov.b v4[5], w0
-; CHECK-NEXT:    mov.b v4[6], w0
-; CHECK-NEXT:    mov.b v4[7], w0
-; CHECK-NEXT:    tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT:    mov.d v2[1], v0[0]
-; CHECK-NEXT:    mov.16b v0, v2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    movi.2d v4, #0xffffffffffffffff
+; CHECK-SD-NEXT:    adrp x8, .LCPI12_0
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    ldr q5, [x8, :lo12:.LCPI12_0]
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    tbl.16b v2, { v2, v3 }, v5
+; CHECK-SD-NEXT:    mov.b v4[0], w0
+; CHECK-SD-NEXT:    mov.b v4[1], w0
+; CHECK-SD-NEXT:    mov.b v4[2], w0
+; CHECK-SD-NEXT:    mov.b v4[3], w0
+; CHECK-SD-NEXT:    mov.b v4[4], w0
+; CHECK-SD-NEXT:    mov.b v4[5], w0
+; CHECK-SD-NEXT:    mov.b v4[6], w0
+; CHECK-SD-NEXT:    mov.b v4[7], w0
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-SD-NEXT:    mov.d v2[1], v0[0]
+; CHECK-SD-NEXT:    mov.16b v0, v2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI12_1
+; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI12_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI12_0
+; CHECK-GI-NEXT:    tbl.16b v2, { v2, v3 }, v4
+; CHECK-GI-NEXT:    tbl.16b v3, { v0, v1 }, v5
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v2, v3 }, v0
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
@@ -312,29 +674,133 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x
   ret <16 x i8> %s
 }
 
+; CHECK-SD-LABEL: .LCPI13_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   8                               // 0x8
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-NEXT:         .byte   255                             // 0xff
+; CHECK-SD-LABEL: .LCPI13_1:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   1                               // 0x1
+; CHECK-SD-NEXT:         .byte   2                               // 0x2
+; CHECK-SD-NEXT:         .byte   3                               // 0x3
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   5                               // 0x5
+; CHECK-SD-NEXT:         .byte   6                               // 0x6
+; CHECK-SD-NEXT:         .byte   7                               // 0x7
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   17                              // 0x11
+; CHECK-SD-NEXT:         .byte   18                              // 0x12
+; CHECK-SD-NEXT:         .byte   19                              // 0x13
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   21                              // 0x15
+; CHECK-SD-NEXT:         .byte   30                              // 0x1e
+; CHECK-SD-NEXT:         .byte   31                              // 0x1f
+
+; CHECK-GI-LABEL: .LCPI13_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   2                               // 0x2
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   30                              // 0x1e
+; CHECK-GI-NEXT:         .byte   31                              // 0x1f
+; CHECK-GI-LABEL: .LCPI13_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup.16b v4, w0
-; CHECK-NEXT:    mov w8, #255 // =0xff
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    adrp x8, .LCPI13_0
-; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT:    adrp x8, .LCPI13_1
-; CHECK-NEXT:    tbl.16b v2, { v2, v3 }, v5
-; CHECK-NEXT:    tbl.16b v3, { v0, v1 }, v4
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI13_1]
-; CHECK-NEXT:    tbl.16b v0, { v2, v3 }, v0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup.16b v4, w0
+; CHECK-SD-NEXT:    mov w8, #255 // =0xff
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-SD-NEXT:    mov.b v4[8], w8
+; CHECK-SD-NEXT:    mov.b v4[9], w8
+; CHECK-SD-NEXT:    mov.b v4[10], w8
+; CHECK-SD-NEXT:    mov.b v4[11], w8
+; CHECK-SD-NEXT:    mov.b v4[12], w8
+; CHECK-SD-NEXT:    mov.b v4[13], w8
+; CHECK-SD-NEXT:    adrp x8, .LCPI13_0
+; CHECK-SD-NEXT:    ldr q5, [x8, :lo12:.LCPI13_0]
+; CHECK-SD-NEXT:    adrp x8, .LCPI13_1
+; CHECK-SD-NEXT:    tbl.16b v2, { v2, v3 }, v5
+; CHECK-SD-NEXT:    tbl.16b v3, { v0, v1 }, v4
+; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI13_1]
+; CHECK-SD-NEXT:    tbl.16b v0, { v2, v3 }, v0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    fmov s6, w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI13_1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov.16b v5, v4
+; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[8], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[9], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[10], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[11], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[12], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[13], v6[0]
+; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
+; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI13_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI13_0
+; CHECK-GI-NEXT:    tbl.16b v2, { v2, v3 }, v4
+; CHECK-GI-NEXT:    tbl.16b v3, { v0, v1 }, v5
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI13_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v2, v3 }, v0
+; CHECK-GI-NEXT:    ret
   %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
   %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
   %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
@@ -357,106 +823,293 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16
   ret <16 x i8> %s
 }
 
+; CHECK-SD-LABEL: .LCPI14_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   32                              // 0x20
+; CHECK-SD-NEXT:         .byte   36                              // 0x24
+; CHECK-SD-NEXT:         .byte   40                              // 0x28
+; CHECK-SD-NEXT:         .byte   44                              // 0x2c
+; CHECK-SD-NEXT:         .byte   48                              // 0x30
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   56                              // 0x38
+; CHECK-SD-NEXT:         .byte   60                              // 0x3c
 
-; CHECK-LABEL: .LCPI14_0:
-; CHECK-NEXT:	.byte	0                               // 0x0
-; CHECK-NEXT:	.byte	4                               // 0x4
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	12                              // 0xc
-; CHECK-NEXT:	.byte	16                              // 0x10
-; CHECK-NEXT:	.byte	20                              // 0x14
-; CHECK-NEXT:	.byte	24                              // 0x18
-; CHECK-NEXT:	.byte	28                              // 0x1c
-; CHECK-NEXT:	.byte	32                              // 0x20
-; CHECK-NEXT:	.byte	36                              // 0x24
-; CHECK-NEXT:	.byte	40                              // 0x28
-; CHECK-NEXT:	.byte	44                              // 0x2c
-; CHECK-NEXT:	.byte	48                              // 0x30
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	56                              // 0x38
-; CHECK-NEXT:	.byte	60                              // 0x3c
+; CHECK-GI-LABEL: .LCPI14_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI14_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI14_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI14_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_1
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI14_1]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i8> %s
 }
 
-; CHECK-LABEL: .LCPI15_0:
-; CHECK-NEXT:	.byte	0                               // 0x0
-; CHECK-NEXT:	.byte	4                               // 0x4
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	12                              // 0xc
-; CHECK-NEXT:	.byte	16                              // 0x10
-; CHECK-NEXT:	.byte	20                              // 0x14
-; CHECK-NEXT:	.byte	24                              // 0x18
-; CHECK-NEXT:	.byte	28                              // 0x1c
-; CHECK-NEXT:	.byte	32                              // 0x20
-; CHECK-NEXT:	.byte	36                              // 0x24
-; CHECK-NEXT:	.byte	40                              // 0x28
-; CHECK-NEXT:	.byte	44                              // 0x2c
-; CHECK-NEXT:	.byte	48                              // 0x30
-; CHECK-NEXT:	.byte	52                              // 0x34
-; CHECK-NEXT:	.byte	56                              // 0x38
-; CHECK-NEXT:	.byte	60                              // 0x3c
+; CHECK-SD-LABEL: .LCPI15_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   32                              // 0x20
+; CHECK-SD-NEXT:         .byte   36                              // 0x24
+; CHECK-SD-NEXT:         .byte   40                              // 0x28
+; CHECK-SD-NEXT:         .byte   44                              // 0x2c
+; CHECK-SD-NEXT:         .byte   48                              // 0x30
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   56                              // 0x38
+; CHECK-SD-NEXT:         .byte   60                              // 0x3c
+
+; CHECK-GI-LABEL: .LCPI15_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI15_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-LABEL: .LCPI15_2:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI15_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI15_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI15_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_2
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI15_2]
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI15_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v5
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI15_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i8> %s
 }
 
-; CHECK-LABEL: .LCPI16_0:
-; CHECK-NEXT: 	.byte	0                               // 0x0
-; CHECK-NEXT: 	.byte	4                               // 0x4
-; CHECK-NEXT: 	.byte	52                              // 0x34
-; CHECK-NEXT: 	.byte	12                              // 0xc
-; CHECK-NEXT: 	.byte	16                              // 0x10
-; CHECK-NEXT: 	.byte	20                              // 0x14
-; CHECK-NEXT: 	.byte	24                              // 0x18
-; CHECK-NEXT: 	.byte	28                              // 0x1c
-; CHECK-NEXT: 	.byte	32                              // 0x20
-; CHECK-NEXT: 	.byte	36                              // 0x24
-; CHECK-NEXT: 	.byte	40                              // 0x28
-; CHECK-NEXT: 	.byte	44                              // 0x2c
-; CHECK-NEXT: 	.byte	48                              // 0x30
-; CHECK-NEXT: 	.byte	52                              // 0x34
-; CHECK-NEXT: 	.byte	56                              // 0x38
-; CHECK-NEXT: 	.byte	60                              // 0x3c
+; CHECK-SD-LABEL: .LCPI16_0:
+; CHECK-SD:              .byte   0                               // 0x0
+; CHECK-SD-NEXT:         .byte   4                               // 0x4
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   12                              // 0xc
+; CHECK-SD-NEXT:         .byte   16                              // 0x10
+; CHECK-SD-NEXT:         .byte   20                              // 0x14
+; CHECK-SD-NEXT:         .byte   24                              // 0x18
+; CHECK-SD-NEXT:         .byte   28                              // 0x1c
+; CHECK-SD-NEXT:         .byte   32                              // 0x20
+; CHECK-SD-NEXT:         .byte   36                              // 0x24
+; CHECK-SD-NEXT:         .byte   40                              // 0x28
+; CHECK-SD-NEXT:         .byte   44                              // 0x2c
+; CHECK-SD-NEXT:         .byte   48                              // 0x30
+; CHECK-SD-NEXT:         .byte   52                              // 0x34
+; CHECK-SD-NEXT:         .byte   56                              // 0x38
+; CHECK-SD-NEXT:         .byte   60                              // 0x3c
+
+; CHECK-GI-LABEL: .LCPI16_0:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   1                               // 0x1
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   3                               // 0x3
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   5                               // 0x5
+; CHECK-GI-NEXT:         .byte   6                               // 0x6
+; CHECK-GI-NEXT:         .byte   7                               // 0x7
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   17                              // 0x11
+; CHECK-GI-NEXT:         .byte   18                              // 0x12
+; CHECK-GI-NEXT:         .byte   19                              // 0x13
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   21                              // 0x15
+; CHECK-GI-NEXT:         .byte   22                              // 0x16
+; CHECK-GI-NEXT:         .byte   23                              // 0x17
+; CHECK-GI-LABEL: .LCPI16_1:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-LABEL: .LCPI16_2:
+; CHECK-GI:              .byte   0                               // 0x0
+; CHECK-GI-NEXT:         .byte   4                               // 0x4
+; CHECK-GI-NEXT:         .byte   8                               // 0x8
+; CHECK-GI-NEXT:         .byte   12                              // 0xc
+; CHECK-GI-NEXT:         .byte   16                              // 0x10
+; CHECK-GI-NEXT:         .byte   20                              // 0x14
+; CHECK-GI-NEXT:         .byte   24                              // 0x18
+; CHECK-GI-NEXT:         .byte   28                              // 0x1c
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
+; CHECK-GI-NEXT:         .byte   255                             // 0xff
 
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    adrp x8, .LCPI16_0
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-SD-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI16_2
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI16_2]
+; CHECK-GI-NEXT:    adrp x8, .LCPI16_1
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI16_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI16_0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v5
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
+; CHECK-GI-NEXT:    ret
   %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 21, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
@@ -491,73 +1144,121 @@ define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
 }
 
 define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
-; CHECK-LABEL: tbx2_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    tbx.8b v0, { v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx2_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    tbx.8b v0, { v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx2_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    tbx.8b v0, { v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
-; CHECK-LABEL: tbx2_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
-; CHECK-NEXT:    tbx.16b v0, { v1, v2 }, v3
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx2_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-SD-NEXT:    tbx.16b v0, { v1, v2 }, v3
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx2_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2 def $q1_q2
+; CHECK-GI-NEXT:    tbx.16b v0, { v1, v2 }, v3
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
-; CHECK-LABEL: tbx3_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    tbx.8b v0, { v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx3_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    tbx.8b v0, { v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx3_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    tbx.8b v0, { v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(< 8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
-; CHECK-LABEL: tbx3_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
-; CHECK-NEXT:    tbx.16b v0, { v1, v2, v3 }, v4
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx3_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-SD-NEXT:    tbx.16b v0, { v1, v2, v3 }, v4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx3_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3 def $q1_q2_q3
+; CHECK-GI-NEXT:    tbx.16b v0, { v1, v2, v3 }, v4
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
   ret <16 x i8> %tmp3
 }
 
 define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
-; CHECK-LABEL: tbx4_8b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    tbx.8b v0, { v1, v2, v3, v4 }, v5
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx4_8b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    tbx.8b v0, { v1, v2, v3, v4 }, v5
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx4_8b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    tbx.8b v0, { v1, v2, v3, v4 }, v5
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
   ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
-; CHECK-LABEL: tbx4_16b:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
-; CHECK-NEXT:    tbx.16b v0, { v1, v2, v3, v4 }, v5
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: tbx4_16b:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-SD-NEXT:    tbx.16b v0, { v1, v2, v3, v4 }, v5
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: tbx4_16b:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    // kill: def $q4 killed $q4 killed $q1_q2_q3_q4 def $q1_q2_q3_q4
+; CHECK-GI-NEXT:    tbx.16b v0, { v1, v2, v3, v4 }, v5
+; CHECK-GI-NEXT:    ret
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
   ret <16 x i8> %tmp3
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index f7d31a214563b..178c229d04e47 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -289,26 +289,27 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: uabd16b_rdx_i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushll.8h v2, v0, #0
-; CHECK-GI-NEXT:    ushll.8h v3, v1, #0
-; CHECK-GI-NEXT:    ushll2.8h v0, v0, #0
-; CHECK-GI-NEXT:    ushll2.8h v1, v1, #0
-; CHECK-GI-NEXT:    usubl.4s v4, v2, v3
-; CHECK-GI-NEXT:    usubl2.4s v2, v2, v3
-; CHECK-GI-NEXT:    usubl.4s v3, v0, v1
-; CHECK-GI-NEXT:    usubl2.4s v0, v0, v1
-; CHECK-GI-NEXT:    cmlt.4s v1, v4, #0
-; CHECK-GI-NEXT:    cmlt.4s v5, v2, #0
-; CHECK-GI-NEXT:    neg.4s v16, v4
-; CHECK-GI-NEXT:    cmlt.4s v6, v3, #0
-; CHECK-GI-NEXT:    cmlt.4s v7, v0, #0
-; CHECK-GI-NEXT:    neg.4s v17, v2
-; CHECK-GI-NEXT:    neg.4s v18, v3
-; CHECK-GI-NEXT:    neg.4s v19, v0
-; CHECK-GI-NEXT:    bsl.16b v1, v16, v4
-; CHECK-GI-NEXT:    bit.16b v2, v17, v5
-; CHECK-GI-NEXT:    bit.16b v3, v18, v6
-; CHECK-GI-NEXT:    bit.16b v0, v19, v7
+; CHECK-GI-NEXT:    usubl.8h v3, v0, v1
+; CHECK-GI-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-GI-NEXT:    usubl2.8h v0, v0, v1
+; CHECK-GI-NEXT:    sshll.4s v1, v3, #0
+; CHECK-GI-NEXT:    sshll2.4s v4, v3, #0
+; CHECK-GI-NEXT:    sshll.4s v5, v0, #0
+; CHECK-GI-NEXT:    sshll2.4s v6, v0, #0
+; CHECK-GI-NEXT:    ssubw2.4s v3, v2, v3
+; CHECK-GI-NEXT:    ssubw2.4s v0, v2, v0
+; CHECK-GI-NEXT:    cmlt.4s v2, v1, #0
+; CHECK-GI-NEXT:    cmlt.4s v7, v4, #0
+; CHECK-GI-NEXT:    neg.4s v16, v1
+; CHECK-GI-NEXT:    cmlt.4s v17, v5, #0
+; CHECK-GI-NEXT:    cmlt.4s v18, v6, #0
+; CHECK-GI-NEXT:    neg.4s v19, v5
+; CHECK-GI-NEXT:    bit.16b v1, v16, v2
+; CHECK-GI-NEXT:    mov.16b v2, v7
+; CHECK-GI-NEXT:    bif.16b v0, v6, v18
+; CHECK-GI-NEXT:    bsl.16b v2, v3, v4
+; CHECK-GI-NEXT:    mov.16b v3, v17
+; CHECK-GI-NEXT:    bsl.16b v3, v19, v5
 ; CHECK-GI-NEXT:    add.4s v1, v1, v2
 ; CHECK-GI-NEXT:    add.4s v0, v3, v0
 ; CHECK-GI-NEXT:    add.4s v0, v1, v0
@@ -336,26 +337,27 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: sabd16b_rdx_i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    sshll.8h v2, v0, #0
-; CHECK-GI-NEXT:    sshll.8h v3, v1, #0
-; CHECK-GI-NEXT:    sshll2.8h v0, v0, #0
-; CHECK-GI-NEXT:    sshll2.8h v1, v1, #0
-; CHECK-GI-NEXT:    ssubl.4s v4, v2, v3
-; CHECK-GI-NEXT:    ssubl2.4s v2, v2, v3
-; CHECK-GI-NEXT:    ssubl.4s v3, v0, v1
-; CHECK-GI-NEXT:    ssubl2.4s v0, v0, v1
-; CHECK-GI-NEXT:    cmlt.4s v1, v4, #0
-; CHECK-GI-NEXT:    cmlt.4s v5, v2, #0
-; CHECK-GI-NEXT:    neg.4s v16, v4
-; CHECK-GI-NEXT:    cmlt.4s v6, v3, #0
-; CHECK-GI-NEXT:    cmlt.4s v7, v0, #0
-; CHECK-GI-NEXT:    neg.4s v17, v2
-; CHECK-GI-NEXT:    neg.4s v18, v3
-; CHECK-GI-NEXT:    neg.4s v19, v0
-; CHECK-GI-NEXT:    bsl.16b v1, v16, v4
-; CHECK-GI-NEXT:    bit.16b v2, v17, v5
-; CHECK-GI-NEXT:    bit.16b v3, v18, v6
-; CHECK-GI-NEXT:    bit.16b v0, v19, v7
+; CHECK-GI-NEXT:    ssubl.8h v3, v0, v1
+; CHECK-GI-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-GI-NEXT:    ssubl2.8h v0, v0, v1
+; CHECK-GI-NEXT:    sshll.4s v1, v3, #0
+; CHECK-GI-NEXT:    sshll2.4s v4, v3, #0
+; CHECK-GI-NEXT:    sshll.4s v5, v0, #0
+; CHECK-GI-NEXT:    sshll2.4s v6, v0, #0
+; CHECK-GI-NEXT:    ssubw2.4s v3, v2, v3
+; CHECK-GI-NEXT:    ssubw2.4s v0, v2, v0
+; CHECK-GI-NEXT:    cmlt.4s v2, v1, #0
+; CHECK-GI-NEXT:    cmlt.4s v7, v4, #0
+; CHECK-GI-NEXT:    neg.4s v16, v1
+; CHECK-GI-NEXT:    cmlt.4s v17, v5, #0
+; CHECK-GI-NEXT:    cmlt.4s v18, v6, #0
+; CHECK-GI-NEXT:    neg.4s v19, v5
+; CHECK-GI-NEXT:    bit.16b v1, v16, v2
+; CHECK-GI-NEXT:    mov.16b v2, v7
+; CHECK-GI-NEXT:    bif.16b v0, v6, v18
+; CHECK-GI-NEXT:    bsl.16b v2, v3, v4
+; CHECK-GI-NEXT:    mov.16b v3, v17
+; CHECK-GI-NEXT:    bsl.16b v3, v19, v5
 ; CHECK-GI-NEXT:    add.4s v1, v1, v2
 ; CHECK-GI-NEXT:    add.4s v0, v3, v0
 ; CHECK-GI-NEXT:    add.4s v0, v1, v0
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll
index 5f293e5c7ea34..66fea3535b1ec 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-uinc-udec-wrap.ll
@@ -55,15 +55,15 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; CHECK-LABEL: atomicrmw_uinc_wrap_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:  .LBB3_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxr x0, [x8]
-; CHECK-NEXT:    cmp x0, x1
-; CHECK-NEXT:    csinc x9, xzr, x0, hs
-; CHECK-NEXT:    stlxr w10, x9, [x8]
+; CHECK-NEXT:    ldaxr x8, [x0]
+; CHECK-NEXT:    cmp x8, x1
+; CHECK-NEXT:    csinc x9, xzr, x8, hs
+; CHECK-NEXT:    stlxr w10, x9, [x0]
 ; CHECK-NEXT:    cbnz w10, .LBB3_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i64 %val seq_cst
   ret i64 %result
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 83c7f73800af1..dfe0e83649e20 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -8,57 +8,57 @@ declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x
 define fastcc i8 @allocno_reload_assign() {
 ; CHECK-LABEL: allocno_reload_assign:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.b, #0 // =0x0
-; CHECK-NEXT:    mov z16.d, #0 // =0x0
+; CHECK-NEXT:    fmov d0, xzr
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    mov z16.d, #0 // =0x0
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    uzp1 p0.s, p0.s, p0.s
+; CHECK-NEXT:    uzp1 p0.h, p0.h, p0.h
+; CHECK-NEXT:    uzp1 p0.b, p0.b, p0.b
+; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z0.b, #0 // =0x0
+; CHECK-NEXT:    sbfx x8, x8, #0, #1
 ; CHECK-NEXT:    uunpklo z1.h, z0.b
 ; CHECK-NEXT:    uunpkhi z0.h, z0.b
+; CHECK-NEXT:    whilelo p1.b, xzr, x8
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpkhi z3.s, z1.h
 ; CHECK-NEXT:    uunpklo z5.s, z0.h
 ; CHECK-NEXT:    uunpkhi z7.s, z0.h
+; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    punpklo p2.h, p1.b
 ; CHECK-NEXT:    uunpklo z0.d, z2.s
 ; CHECK-NEXT:    uunpkhi z1.d, z2.s
+; CHECK-NEXT:    punpkhi p3.h, p1.b
 ; CHECK-NEXT:    uunpklo z2.d, z3.s
 ; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    punpklo p5.h, p0.b
 ; CHECK-NEXT:    uunpklo z4.d, z5.s
 ; CHECK-NEXT:    uunpkhi z5.d, z5.s
+; CHECK-NEXT:    punpkhi p7.h, p0.b
 ; CHECK-NEXT:    uunpklo z6.d, z7.s
 ; CHECK-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    fmov d17, xzr
-; CHECK-NEXT:    cmpeq p2.d, p0/z, z17.d, #0
-; CHECK-NEXT:    uzp1 p2.s, p2.s, p0.s
-; CHECK-NEXT:    uzp1 p2.h, p2.h, p0.h
-; CHECK-NEXT:    uzp1 p2.b, p2.b, p0.b
-; CHECK-NEXT:    mov z17.b, p2/z, #1 // =0x1
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-NEXT:    whilelo p2.b, xzr, x8
-; CHECK-NEXT:    not p2.b, p1/z, p2.b
-; CHECK-NEXT:    punpklo p3.h, p2.b
-; CHECK-NEXT:    punpkhi p2.h, p2.b
-; CHECK-NEXT:    punpklo p4.h, p3.b
-; CHECK-NEXT:    punpkhi p3.h, p3.b
-; CHECK-NEXT:    punpklo p5.h, p4.b
-; CHECK-NEXT:    punpkhi p4.h, p4.b
-; CHECK-NEXT:    st1b { z0.d }, p5, [z16.d]
-; CHECK-NEXT:    st1b { z1.d }, p4, [z16.d]
-; CHECK-NEXT:    punpklo p4.h, p3.b
+; CHECK-NEXT:    punpklo p0.h, p2.b
+; CHECK-NEXT:    punpkhi p1.h, p2.b
+; CHECK-NEXT:    punpklo p2.h, p3.b
 ; CHECK-NEXT:    punpkhi p3.h, p3.b
-; CHECK-NEXT:    st1b { z2.d }, p4, [z16.d]
+; CHECK-NEXT:    punpklo p4.h, p5.b
+; CHECK-NEXT:    punpkhi p5.h, p5.b
+; CHECK-NEXT:    punpklo p6.h, p7.b
+; CHECK-NEXT:    punpkhi p7.h, p7.b
+; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st1b { z0.d }, p0, [z16.d]
+; CHECK-NEXT:    st1b { z1.d }, p1, [z16.d]
+; CHECK-NEXT:    st1b { z2.d }, p2, [z16.d]
 ; CHECK-NEXT:    st1b { z3.d }, p3, [z16.d]
-; CHECK-NEXT:    punpklo p3.h, p2.b
-; CHECK-NEXT:    punpkhi p2.h, p2.b
-; CHECK-NEXT:    punpklo p4.h, p3.b
-; CHECK-NEXT:    punpkhi p3.h, p3.b
 ; CHECK-NEXT:    st1b { z4.d }, p4, [z16.d]
-; CHECK-NEXT:    st1b { z5.d }, p3, [z16.d]
-; CHECK-NEXT:    punpklo p3.h, p2.b
-; CHECK-NEXT:    punpkhi p2.h, p2.b
-; CHECK-NEXT:    st1b { z6.d }, p3, [z16.d]
-; CHECK-NEXT:    st1b { z7.d }, p2, [z16.d]
+; CHECK-NEXT:    st1b { z5.d }, p5, [z16.d]
+; CHECK-NEXT:    st1b { z6.d }, p6, [z16.d]
+; CHECK-NEXT:    st1b { z7.d }, p7, [z16.d]
 ; CHECK-NEXT:    b .LBB0_1
   br label %1
 
diff --git a/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
new file mode 100644
index 0000000000000..dff216192a6c3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fixed-point-conv-vec-pat.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64"
+
+; First some corner cases
+define <4 x float> @f_v4_s0(<4 x i32> %u) {
+; CHECK-LABEL: f_v4_s0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %s = ashr exact <4 x i32> %u, <i32 0, i32 0, i32 0, i32 0>
+  %v = sitofp <4 x i32> %s to <4 x float>
+  ret <4 x float> %v
+}
+
+define <4 x float> @f_v4_s1(<4 x i32> %u) {
+; CHECK-LABEL: f_v4_s1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4s, v0.4s, #1
+; CHECK-NEXT:    ret
+  %s = ashr exact <4 x i32> %u, <i32 1, i32 1, i32 1, i32 1>
+  %v = sitofp <4 x i32> %s to <4 x float>
+  ret <4 x float> %v
+}
+
+define <4 x float> @f_v4_s24_inexact(<4 x i32> %u) {
+; CHECK-LABEL: f_v4_s24_inexact:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %s = ashr <4 x i32> %u, <i32 24, i32 24, i32 24, i32 24>
+  %v = sitofp <4 x i32> %s to <4 x float>
+  ret <4 x float> %v
+}
+
+define <4 x float> @f_v4_s31(<4 x i32> %u) {
+; CHECK-LABEL: f_v4_s31:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    ret
+  %s = ashr <4 x i32> %u, <i32 31, i32 31, i32 31, i32 31>
+  %v = sitofp <4 x i32> %s to <4 x float>
+  ret <4 x float> %v
+}
+
+; Common cases for conversion from signed integer to floating point types
+define <2 x float> @f_v2_s24(<2 x i32> %u) {
+; CHECK-LABEL: f_v2_s24:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.2s, v0.2s, #24
+; CHECK-NEXT:    ret
+  %s = ashr exact <2 x i32> %u, <i32 24, i32 24>
+  %v = sitofp <2 x i32> %s to <2 x float>
+  ret <2 x float> %v
+}
+
+define <4 x float> @f_v4_s24(<4 x i32> %u) {
+; CHECK-LABEL: f_v4_s24:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4s, v0.4s, #24
+; CHECK-NEXT:    ret
+  %s = ashr exact <4 x i32> %u, <i32 24, i32 24, i32 24, i32 24>
+  %v = sitofp <4 x i32> %s to <4 x float>
+  ret <4 x float> %v
+}
+
+; Check legalisation to <2 x f64> does not get in the way
+define <8 x double> @d_v8_s64(<8 x i64> %u) {
+; CHECK-LABEL: d_v8_s64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.2d, v0.2d, #56
+; CHECK-NEXT:    scvtf v1.2d, v1.2d, #56
+; CHECK-NEXT:    scvtf v2.2d, v2.2d, #56
+; CHECK-NEXT:    scvtf v3.2d, v3.2d, #56
+; CHECK-NEXT:    ret
+  %s = ashr exact <8 x i64> %u, <i64 56, i64 56, i64 56, i64 56, i64 56, i64 56, i64 56, i64 56>
+  %v = sitofp <8 x i64> %s to <8 x double>
+  ret <8 x double> %v
+}
+
+define <4 x half> @h_v4_s8(<4 x i16> %u) #0 {
+; CHECK-LABEL: h_v4_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.4h, v0.4h, #8
+; CHECK-NEXT:    ret
+  %s = ashr exact <4 x i16> %u, <i16 8, i16 8, i16 8, i16 8>
+  %v = sitofp <4 x i16> %s to <4 x half>
+  ret <4 x half> %v
+}
+
+define <8 x half> @h_v8_s8(<8 x i16> %u) #0 {
+; CHECK-LABEL: h_v8_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    scvtf v0.8h, v0.8h, #8
+; CHECK-NEXT:    ret
+  %s = ashr exact <8 x i16> %u, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %v = sitofp <8 x i16> %s to <8 x half>
+  ret <8 x half> %v
+}
+
+attributes #0 = { "target-features"="+fullfp16"}
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll b/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll
index 6effc63ecc13c..fe3715341a25b 100644
--- a/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll
+++ b/llvm/test/CodeGen/AArch64/machine-sink-cache-invalidation.ll
@@ -9,20 +9,20 @@ target triple = "arm64-apple-macosx13.5.0"
 define i32 @nsis_BZ2_bzDecompress(ptr %pos.i, i1 %cmp661.not3117.i, i1 %exitcond.not.i) {
 ; CHECK-LABEL: nsis_BZ2_bzDecompress:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: // %while.end671.i
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    strb w9, [x0]
 ; CHECK-NEXT:    tbnz w2, #0, .LBB0_4
 ; CHECK-NEXT:  .LBB0_2: // %for.body653.i
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x0]
 ; CHECK-NEXT:    tbnz w1, #0, .LBB0_1
 ; CHECK-NEXT:  // %bb.3: // %while.body663.i
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    ldrb w9, [x9]
-; CHECK-NEXT:    strb wzr, [x0, x9]
+; CHECK-NEXT:    ldrb w10, [x8]
+; CHECK-NEXT:    strb wzr, [x0, x10]
 ; CHECK-NEXT:    b .LBB0_1
 ; CHECK-NEXT:  .LBB0_4: // %for.end677.i
 ; CHECK-NEXT:    mov w0, wzr
diff --git a/llvm/test/CodeGen/AArch64/mulcmle.ll b/llvm/test/CodeGen/AArch64/mulcmle.ll
index 5c216b8550080..32bc5c5e63b3e 100644
--- a/llvm/test/CodeGen/AArch64/mulcmle.ll
+++ b/llvm/test/CodeGen/AArch64/mulcmle.ll
@@ -1,11 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 %s -o - -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <1 x i64> @v1i64(<1 x i64> %a) {
-; CHECK-LABEL: v1i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v1i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v1i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    lsr x8, x8, #31
+; CHECK-GI-NEXT:    and x8, x8, #0x100000001
+; CHECK-GI-NEXT:    lsl x9, x8, #32
+; CHECK-GI-NEXT:    sub x8, x9, x8
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    ret
   %b = lshr <1 x i64> %a, <i64 31>
   %c = and <1 x i64> %b, <i64 4294967297>
   %d = mul nuw <1 x i64> %c, <i64 4294967295>
diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll
index 16200435c5c31..402682c89124b 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple aarch64 -o - | FileCheck %s
+; RUN: llc < %s -mtriple aarch64 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple aarch64 -o - -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <8 x i16> @extadds_v8i8_i16(<8 x i8> %s0, <8 x i8> %s1) {
 ; CHECK-LABEL: extadds_v8i8_i16:
@@ -26,12 +27,19 @@ entry:
 }
 
 define <16 x i16> @extadds_v16i8_i16(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: extadds_v16i8_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v16i8_i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v16i8_i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    saddl2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i16>
   %s1s = sext <16 x i8> %s1 to <16 x i16>
@@ -40,12 +48,19 @@ entry:
 }
 
 define <16 x i16> @extaddu_v16i8_i16(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: extaddu_v16i8_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v16i8_i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v16i8_i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uaddl2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i16>
   %s1s = zext <16 x i8> %s1 to <16 x i16>
@@ -54,16 +69,26 @@ entry:
 }
 
 define <32 x i16> @extadds_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) {
-; CHECK-LABEL: extadds_v32i8_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v4.8h, v1.16b, v3.16b
-; CHECK-NEXT:    saddl v5.8h, v0.8b, v2.8b
-; CHECK-NEXT:    saddl2 v6.8h, v0.16b, v2.16b
-; CHECK-NEXT:    saddl v2.8h, v1.8b, v3.8b
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v32i8_i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v4.8h, v1.16b, v3.16b
+; CHECK-SD-NEXT:    saddl v5.8h, v0.8b, v2.8b
+; CHECK-SD-NEXT:    saddl2 v6.8h, v0.16b, v2.16b
+; CHECK-SD-NEXT:    saddl v2.8h, v1.8b, v3.8b
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v32i8_i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v4.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    saddl2 v5.8h, v0.16b, v2.16b
+; CHECK-GI-NEXT:    saddl v2.8h, v1.8b, v3.8b
+; CHECK-GI-NEXT:    saddl2 v3.8h, v1.16b, v3.16b
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <32 x i8> %s0 to <32 x i16>
   %s1s = sext <32 x i8> %s1 to <32 x i16>
@@ -72,16 +97,26 @@ entry:
 }
 
 define <32 x i16> @extaddu_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) {
-; CHECK-LABEL: extaddu_v32i8_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v4.8h, v1.16b, v3.16b
-; CHECK-NEXT:    uaddl v5.8h, v0.8b, v2.8b
-; CHECK-NEXT:    uaddl2 v6.8h, v0.16b, v2.16b
-; CHECK-NEXT:    uaddl v2.8h, v1.8b, v3.8b
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v32i8_i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v4.8h, v1.16b, v3.16b
+; CHECK-SD-NEXT:    uaddl v5.8h, v0.8b, v2.8b
+; CHECK-SD-NEXT:    uaddl2 v6.8h, v0.16b, v2.16b
+; CHECK-SD-NEXT:    uaddl v2.8h, v1.8b, v3.8b
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v32i8_i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v4.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    uaddl2 v5.8h, v0.16b, v2.16b
+; CHECK-GI-NEXT:    uaddl v2.8h, v1.8b, v3.8b
+; CHECK-GI-NEXT:    uaddl2 v3.8h, v1.16b, v3.16b
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <32 x i8> %s0 to <32 x i16>
   %s1s = zext <32 x i8> %s1 to <32 x i16>
@@ -90,12 +125,19 @@ entry:
 }
 
 define <8 x i32> @extadds_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
-; CHECK-LABEL: extadds_v8i8_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v8i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v8i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i32>
   %s1s = sext <8 x i8> %s1 to <8 x i32>
@@ -104,12 +146,19 @@ entry:
 }
 
 define <8 x i32> @extaddu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
-; CHECK-LABEL: extaddu_v8i8_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v8i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v8i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ushll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i8> %s0 to <8 x i32>
   %s1s = zext <8 x i8> %s1 to <8 x i32>
@@ -117,16 +166,68 @@ entry:
   ret <8 x i32> %m
 }
 
+define <8 x i32> @extsubs_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extsubs_v8i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ssubl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v8i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ssubl v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = sext <8 x i8> %s0 to <8 x i32>
+  %s1s = sext <8 x i8> %s1 to <8 x i32>
+  %m = sub <8 x i32> %s0s, %s1s
+  ret <8 x i32> %m
+}
+
+define <8 x i32> @extsubu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extsubu_v8i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v8i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    usubl v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = zext <8 x i8> %s0 to <8 x i32>
+  %s1s = zext <8 x i8> %s1 to <8 x i32>
+  %m = sub <8 x i32> %s0s, %s1s
+  ret <8 x i32> %m
+}
+
 define <16 x i32> @extadds_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: extadds_v16i8_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    saddl2 v4.8h, v0.16b, v1.16b
-; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v16i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    saddl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v16i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    saddl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i32>
   %s1s = sext <16 x i8> %s1 to <16 x i32>
@@ -135,15 +236,25 @@ entry:
 }
 
 define <16 x i32> @extaddu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: extaddu_v16i8_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    uaddl2 v4.8h, v0.16b, v1.16b
-; CHECK-NEXT:    ushll v0.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    ushll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v16i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uaddl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ushll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    ushll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    ushll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    ushll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v16i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uaddl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ushll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
   %s1s = zext <16 x i8> %s1 to <16 x i32>
@@ -151,17 +262,82 @@ entry:
   ret <16 x i32> %m
 }
 
+define <16 x i32> @extsubs_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-SD-LABEL: extsubs_v16i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ssubl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v16i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ssubl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = sext <16 x i8> %s0 to <16 x i32>
+  %s1s = sext <16 x i8> %s1 to <16 x i32>
+  %m = sub <16 x i32> %s0s, %s1s
+  ret <16 x i32> %m
+}
+
+define <16 x i32> @extsubu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
+; CHECK-SD-LABEL: extsubu_v16i8_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    usubl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v16i8_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    usubl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = zext <16 x i8> %s0 to <16 x i32>
+  %s1s = zext <16 x i8> %s1 to <16 x i32>
+  %m = sub <16 x i32> %s0s, %s1s
+  ret <16 x i32> %m
+}
+
 define <8 x i64> @extadds_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
-; CHECK-LABEL: extadds_v8i8_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.2d, v1.2s, #0
-; CHECK-NEXT:    sshll2 v3.2d, v2.4s, #0
-; CHECK-NEXT:    sshll2 v1.2d, v1.4s, #0
-; CHECK-NEXT:    sshll v2.2d, v2.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v8i8_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v8i8_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i64>
   %s1s = sext <8 x i8> %s1 to <8 x i64>
@@ -170,16 +346,27 @@ entry:
 }
 
 define <8 x i64> @extaddu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
-; CHECK-LABEL: extaddu_v8i8_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.2d, v1.2s, #0
-; CHECK-NEXT:    ushll2 v3.2d, v2.4s, #0
-; CHECK-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v8i8_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT:    ushll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v8i8_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i8> %s0 to <8 x i64>
   %s1s = zext <8 x i8> %s1 to <8 x i64>
@@ -187,6 +374,384 @@ entry:
   ret <8 x i64> %m
 }
 
+define <8 x i64> @extsubs_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extsubs_v8i8_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ssubl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v8i8_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ssubl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = sext <8 x i8> %s0 to <8 x i64>
+  %s1s = sext <8 x i8> %s1 to <8 x i64>
+  %m = sub <8 x i64> %s0s, %s1s
+  ret <8 x i64> %m
+}
+
+define <8 x i64> @extsubu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
+; CHECK-SD-LABEL: extsubu_v8i8_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v8i8_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = zext <8 x i8> %s0 to <8 x i64>
+  %s1s = zext <8 x i8> %s1 to <8 x i64>
+  %m = sub <8 x i64> %s0s, %s1s
+  ret <8 x i64> %m
+}
+
+define <16 x i64> @extaddu_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: extaddu_v16i8_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uaddl2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ushll v3.4s, v2.4h, #0
+; CHECK-SD-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT:    ushll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ushll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    ushll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    ushll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v16i8_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uaddl2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v7.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v5.2s, #0
+; CHECK-GI-NEXT:    ushll2 v5.2d, v5.4s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT:    ret
+    %c = zext <16 x i8> %a to <16 x i64>
+    %d = zext <16 x i8> %b to <16 x i64>
+    %e = add <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extadds_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: extadds_v16i8_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    saddl2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v16i8_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    saddl2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v7.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT:    ret
+    %c = sext <16 x i8> %a to <16 x i64>
+    %d = sext <16 x i8> %b to <16 x i64>
+    %e = add <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extsubu_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: extsubu_v16i8_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    usubl2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v16i8_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    usubl2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v7.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT:    ret
+    %c = zext <16 x i8> %a to <16 x i64>
+    %d = zext <16 x i8> %b to <16 x i64>
+    %e = sub <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extsubs_v16i8_i64(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: extsubs_v16i8_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ssubl2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v16i8_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ssubl2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v7.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT:    ret
+    %c = sext <16 x i8> %a to <16 x i64>
+    %d = sext <16 x i8> %b to <16 x i64>
+    %e = sub <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extaddu_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-SD-LABEL: extaddu_v16i16_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    uaddl v5.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    uaddl v4.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    uaddl2 v2.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uaddl2 v6.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    ushll2 v1.2d, v4.4s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ushll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ushll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    ushll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    ushll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v16i16_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uaddl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    uaddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    uaddl v6.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    uaddl2 v7.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ushll v0.2d, v4.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v4.4s, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v5.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v5.4s, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT:    ushll2 v5.2d, v6.4s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT:    ret
+    %c = zext <16 x i16> %a to <16 x i64>
+    %d = zext <16 x i16> %b to <16 x i64>
+    %e = add <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extadds_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-SD-LABEL: extadds_v16i16_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    saddl v5.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    saddl v4.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    saddl2 v2.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    saddl2 v6.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v16i16_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    saddl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    saddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    saddl v6.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    saddl2 v7.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v6.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT:    ret
+    %c = sext <16 x i16> %a to <16 x i64>
+    %d = sext <16 x i16> %b to <16 x i64>
+    %e = add <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extsubu_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-SD-LABEL: extsubu_v16i16_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    usubl v5.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    usubl v4.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    usubl2 v2.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    usubl2 v6.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v16i16_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    usubl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    usubl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    usubl v6.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    usubl2 v7.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v6.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT:    ret
+    %c = zext <16 x i16> %a to <16 x i64>
+    %d = zext <16 x i16> %b to <16 x i64>
+    %e = sub <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
+define <16 x i64> @extsubs_v16i16_i64(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-SD-LABEL: extsubs_v16i16_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ssubl v5.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    ssubl v4.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    ssubl2 v2.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    ssubl2 v6.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v16i16_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ssubl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    ssubl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    ssubl v6.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    ssubl2 v7.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v4.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v5.4s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll2 v5.2d, v6.4s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll2 v7.2d, v7.4s, #0
+; CHECK-GI-NEXT:    ret
+    %c = sext <16 x i16> %a to <16 x i64>
+    %d = sext <16 x i16> %b to <16 x i64>
+    %e = sub <16 x i64> %c, %d
+    ret <16 x i64> %e
+}
+
 define <4 x i32> @extadds_v4i16_i32(<4 x i16> %s0, <4 x i16> %s1) {
 ; CHECK-LABEL: extadds_v4i16_i32:
 ; CHECK:       // %bb.0: // %entry
@@ -212,12 +777,19 @@ entry:
 }
 
 define <8 x i32> @extadds_v8i16_i32(<8 x i16> %s0, <8 x i16> %s1) {
-; CHECK-LABEL: extadds_v8i16_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v8i16_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v8i16_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    saddl2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i32>
   %s1s = sext <8 x i16> %s1 to <8 x i32>
@@ -226,12 +798,19 @@ entry:
 }
 
 define <8 x i32> @extaddu_v8i16_i32(<8 x i16> %s0, <8 x i16> %s1) {
-; CHECK-LABEL: extaddu_v8i16_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v8i16_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v8i16_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    uaddl2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i32>
   %s1s = zext <8 x i16> %s1 to <8 x i32>
@@ -240,16 +819,26 @@ entry:
 }
 
 define <16 x i32> @extadds_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) {
-; CHECK-LABEL: extadds_v16i16_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v4.4s, v1.8h, v3.8h
-; CHECK-NEXT:    saddl v5.4s, v0.4h, v2.4h
-; CHECK-NEXT:    saddl2 v6.4s, v0.8h, v2.8h
-; CHECK-NEXT:    saddl v2.4s, v1.4h, v3.4h
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v16i16_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v4.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    saddl v5.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    saddl2 v6.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    saddl v2.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v16i16_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    saddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    saddl v2.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    saddl2 v3.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i16> %s0 to <16 x i32>
   %s1s = sext <16 x i16> %s1 to <16 x i32>
@@ -258,16 +847,26 @@ entry:
 }
 
 define <16 x i32> @extaddu_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) {
-; CHECK-LABEL: extaddu_v16i16_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v4.4s, v1.8h, v3.8h
-; CHECK-NEXT:    uaddl v5.4s, v0.4h, v2.4h
-; CHECK-NEXT:    uaddl2 v6.4s, v0.8h, v2.8h
-; CHECK-NEXT:    uaddl v2.4s, v1.4h, v3.4h
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v16i16_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v4.4s, v1.8h, v3.8h
+; CHECK-SD-NEXT:    uaddl v5.4s, v0.4h, v2.4h
+; CHECK-SD-NEXT:    uaddl2 v6.4s, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uaddl v2.4s, v1.4h, v3.4h
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v16i16_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v4.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    uaddl2 v5.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    uaddl v2.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    uaddl2 v3.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i16> %s0 to <16 x i32>
   %s1s = zext <16 x i16> %s1 to <16 x i32>
@@ -276,12 +875,19 @@ entry:
 }
 
 define <4 x i64> @extadds_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) {
-; CHECK-LABEL: extadds_v4i16_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    sshll2 v1.2d, v0.4s, #0
-; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v4i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    sshll2 v1.2d, v0.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v4i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    sshll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <4 x i16> %s0 to <4 x i64>
   %s1s = sext <4 x i16> %s1 to <4 x i64>
@@ -290,12 +896,19 @@ entry:
 }
 
 define <4 x i64> @extaddu_v4i16_i64(<4 x i16> %s0, <4 x i16> %s1) {
-; CHECK-LABEL: extaddu_v4i16_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v4i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v4i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <4 x i16> %s0 to <4 x i64>
   %s1s = zext <4 x i16> %s1 to <4 x i64>
@@ -304,15 +917,25 @@ entry:
 }
 
 define <8 x i64> @extadds_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
-; CHECK-LABEL: extadds_v8i16_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v2.4s, v0.4h, v1.4h
-; CHECK-NEXT:    saddl2 v4.4s, v0.8h, v1.8h
-; CHECK-NEXT:    sshll v0.2d, v2.2s, #0
-; CHECK-NEXT:    sshll2 v3.2d, v4.4s, #0
-; CHECK-NEXT:    sshll2 v1.2d, v2.4s, #0
-; CHECK-NEXT:    sshll v2.2d, v4.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v8i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    saddl2 v4.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v8i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    saddl2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i64>
   %s1s = sext <8 x i16> %s1 to <8 x i64>
@@ -321,15 +944,25 @@ entry:
 }
 
 define <8 x i64> @extaddu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
-; CHECK-LABEL: extaddu_v8i16_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v2.4s, v0.4h, v1.4h
-; CHECK-NEXT:    uaddl2 v4.4s, v0.8h, v1.8h
-; CHECK-NEXT:    ushll v0.2d, v2.2s, #0
-; CHECK-NEXT:    ushll2 v3.2d, v4.4s, #0
-; CHECK-NEXT:    ushll2 v1.2d, v2.4s, #0
-; CHECK-NEXT:    ushll v2.2d, v4.2s, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v8i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uaddl2 v4.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    ushll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ushll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v8i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    uaddl2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ushll v0.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v2.4s, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i64>
   %s1s = zext <8 x i16> %s1 to <8 x i64>
@@ -337,6 +970,60 @@ entry:
   ret <8 x i64> %m
 }
 
+define <8 x i64> @extsubs_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
+; CHECK-SD-LABEL: extsubs_v8i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ssubl v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    ssubl2 v4.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubs_v8i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ssubl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ssubl2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = sext <8 x i16> %s0 to <8 x i64>
+  %s1s = sext <8 x i16> %s1 to <8 x i64>
+  %m = sub <8 x i64> %s0s, %s1s
+  ret <8 x i64> %m
+}
+
+define <8 x i64> @extsubu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
+; CHECK-SD-LABEL: extsubu_v8i16_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usubl v2.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    usubl2 v4.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extsubu_v8i16_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    usubl v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    usubl2 v3.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    ret
+entry:
+  %s0s = zext <8 x i16> %s0 to <8 x i64>
+  %s1s = zext <8 x i16> %s1 to <8 x i64>
+  %m = sub <8 x i64> %s0s, %s1s
+  ret <8 x i64> %m
+}
+
 define <2 x i64> @extadds_v2i32_i64(<2 x i32> %s0, <2 x i32> %s1) {
 ; CHECK-LABEL: extadds_v2i32_i64:
 ; CHECK:       // %bb.0: // %entry
@@ -362,12 +1049,19 @@ entry:
 }
 
 define <4 x i64> @extadds_v4i32_i64(<4 x i32> %s0, <4 x i32> %s1) {
-; CHECK-LABEL: extadds_v4i32_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v4i32_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v4i32_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    saddl2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <4 x i32> %s0 to <4 x i64>
   %s1s = sext <4 x i32> %s1 to <4 x i64>
@@ -376,12 +1070,19 @@ entry:
 }
 
 define <4 x i64> @extaddu_v4i32_i64(<4 x i32> %s0, <4 x i32> %s1) {
-; CHECK-LABEL: extaddu_v4i32_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v4i32_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v4i32_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    uaddl2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <4 x i32> %s0 to <4 x i64>
   %s1s = zext <4 x i32> %s1 to <4 x i64>
@@ -390,16 +1091,26 @@ entry:
 }
 
 define <8 x i64> @extadds_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) {
-; CHECK-LABEL: extadds_v8i32_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v4.2d, v1.4s, v3.4s
-; CHECK-NEXT:    saddl v5.2d, v0.2s, v2.2s
-; CHECK-NEXT:    saddl2 v6.2d, v0.4s, v2.4s
-; CHECK-NEXT:    saddl v2.2d, v1.2s, v3.2s
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extadds_v8i32_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v4.2d, v1.4s, v3.4s
+; CHECK-SD-NEXT:    saddl v5.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    saddl2 v6.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    saddl v2.2d, v1.2s, v3.2s
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extadds_v8i32_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddl v4.2d, v0.2s, v2.2s
+; CHECK-GI-NEXT:    saddl2 v5.2d, v0.4s, v2.4s
+; CHECK-GI-NEXT:    saddl v2.2d, v1.2s, v3.2s
+; CHECK-GI-NEXT:    saddl2 v3.2d, v1.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i32> %s0 to <8 x i64>
   %s1s = sext <8 x i32> %s1 to <8 x i64>
@@ -408,16 +1119,26 @@ entry:
 }
 
 define <8 x i64> @extaddu_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) {
-; CHECK-LABEL: extaddu_v8i32_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v4.2d, v1.4s, v3.4s
-; CHECK-NEXT:    uaddl v5.2d, v0.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v6.2d, v0.4s, v2.4s
-; CHECK-NEXT:    uaddl v2.2d, v1.2s, v3.2s
-; CHECK-NEXT:    mov v0.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
-; CHECK-NEXT:    mov v3.16b, v4.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extaddu_v8i32_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v4.2d, v1.4s, v3.4s
+; CHECK-SD-NEXT:    uaddl v5.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    uaddl2 v6.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    uaddl v2.2d, v1.2s, v3.2s
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
+; CHECK-SD-NEXT:    mov v1.16b, v6.16b
+; CHECK-SD-NEXT:    mov v3.16b, v4.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extaddu_v8i32_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddl v4.2d, v0.2s, v2.2s
+; CHECK-GI-NEXT:    uaddl2 v5.2d, v0.4s, v2.4s
+; CHECK-GI-NEXT:    uaddl v2.2d, v1.2s, v3.2s
+; CHECK-GI-NEXT:    uaddl2 v3.2d, v1.4s, v3.4s
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov v1.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <8 x i32> %s0 to <8 x i64>
   %s1s = zext <8 x i32> %s1 to <8 x i64>
@@ -426,17 +1147,33 @@ entry:
 }
 
 define <16 x i32> @add_zs(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: add_zs:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    saddw v2.8h, v2.8h, v1.8b
-; CHECK-NEXT:    saddw2 v4.8h, v0.8h, v1.16b
-; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_zs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT:    saddw v2.8h, v2.8h, v1.8b
+; CHECK-SD-NEXT:    saddw2 v4.8h, v0.8h, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_zs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v6.4s, v1.8h, #0
+; CHECK-GI-NEXT:    uaddw v0.4s, v0.4s, v3.4h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v2.4s, v3.8h
+; CHECK-GI-NEXT:    uaddw v2.4s, v5.4s, v4.4h
+; CHECK-GI-NEXT:    uaddw2 v3.4s, v6.4s, v4.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
   %s1s = sext <16 x i8> %s1 to <16 x i32>
@@ -445,87 +1182,180 @@ entry:
 }
 
 define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) {
-; CHECK-LABEL: v20:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ldr b2, [sp, #160]
-; CHECK-NEXT:    add x10, sp, #168
-; CHECK-NEXT:    ldr b3, [sp]
-; CHECK-NEXT:    add x11, sp, #8
-; CHECK-NEXT:    ldr b1, [sp, #96]
-; CHECK-NEXT:    ld1 { v2.b }[1], [x10]
-; CHECK-NEXT:    add x9, sp, #104
-; CHECK-NEXT:    add x10, sp, #176
-; CHECK-NEXT:    mov v0.b[1], w1
-; CHECK-NEXT:    ld1 { v3.b }[1], [x11]
-; CHECK-NEXT:    ld1 { v1.b }[1], [x9]
-; CHECK-NEXT:    add x11, sp, #16
-; CHECK-NEXT:    add x9, sp, #112
-; CHECK-NEXT:    add x13, sp, #184
-; CHECK-NEXT:    ld1 { v2.b }[2], [x10]
-; CHECK-NEXT:    add x12, sp, #120
-; CHECK-NEXT:    add x14, sp, #32
-; CHECK-NEXT:    ld1 { v3.b }[2], [x11]
-; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
-; CHECK-NEXT:    ldr b5, [sp, #64]
-; CHECK-NEXT:    mov v0.b[2], w2
-; CHECK-NEXT:    ldr b4, [sp, #224]
-; CHECK-NEXT:    add x11, sp, #128
-; CHECK-NEXT:    ld1 { v2.b }[3], [x13]
-; CHECK-NEXT:    add x13, sp, #24
-; CHECK-NEXT:    add x10, sp, #136
-; CHECK-NEXT:    ld1 { v3.b }[3], [x13]
-; CHECK-NEXT:    ld1 { v1.b }[3], [x12]
-; CHECK-NEXT:    add x12, sp, #192
-; CHECK-NEXT:    add x13, sp, #200
-; CHECK-NEXT:    add x15, sp, #80
-; CHECK-NEXT:    add x9, sp, #144
-; CHECK-NEXT:    mov v0.b[3], w3
-; CHECK-NEXT:    ld1 { v2.b }[4], [x12]
-; CHECK-NEXT:    add x12, sp, #232
-; CHECK-NEXT:    ld1 { v3.b }[4], [x14]
-; CHECK-NEXT:    add x14, sp, #72
-; CHECK-NEXT:    ld1 { v4.b }[1], [x12]
-; CHECK-NEXT:    ld1 { v5.b }[1], [x14]
-; CHECK-NEXT:    add x14, sp, #40
-; CHECK-NEXT:    ld1 { v1.b }[4], [x11]
-; CHECK-NEXT:    ld1 { v2.b }[5], [x13]
-; CHECK-NEXT:    add x12, sp, #208
-; CHECK-NEXT:    add x13, sp, #48
-; CHECK-NEXT:    mov v0.b[4], w4
-; CHECK-NEXT:    ld1 { v3.b }[5], [x14]
-; CHECK-NEXT:    add x14, sp, #240
-; CHECK-NEXT:    ld1 { v4.b }[2], [x14]
-; CHECK-NEXT:    ld1 { v5.b }[2], [x15]
-; CHECK-NEXT:    ld1 { v1.b }[5], [x10]
-; CHECK-NEXT:    ld1 { v2.b }[6], [x12]
-; CHECK-NEXT:    add x11, sp, #216
-; CHECK-NEXT:    add x10, sp, #56
-; CHECK-NEXT:    ld1 { v3.b }[6], [x13]
-; CHECK-NEXT:    add x12, sp, #248
-; CHECK-NEXT:    add x13, sp, #88
-; CHECK-NEXT:    mov v0.b[5], w5
-; CHECK-NEXT:    ld1 { v4.b }[3], [x12]
-; CHECK-NEXT:    ld1 { v5.b }[3], [x13]
-; CHECK-NEXT:    ld1 { v1.b }[6], [x9]
-; CHECK-NEXT:    ld1 { v2.b }[7], [x11]
-; CHECK-NEXT:    add x9, sp, #152
-; CHECK-NEXT:    ld1 { v3.b }[7], [x10]
-; CHECK-NEXT:    uaddl v4.8h, v5.8b, v4.8b
-; CHECK-NEXT:    mov v0.b[6], w6
-; CHECK-NEXT:    ld1 { v1.b }[7], [x9]
-; CHECK-NEXT:    uaddl v2.8h, v3.8b, v2.8b
-; CHECK-NEXT:    ushll v3.4s, v4.4h, #0
-; CHECK-NEXT:    mov v0.b[7], w7
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ushll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-NEXT:    stp q1, q3, [x8, #48]
-; CHECK-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    stp q3, q2, [x8, #16]
-; CHECK-NEXT:    str q0, [x8]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v20:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    ldr b2, [sp, #160]
+; CHECK-SD-NEXT:    add x10, sp, #168
+; CHECK-SD-NEXT:    ldr b3, [sp]
+; CHECK-SD-NEXT:    add x11, sp, #8
+; CHECK-SD-NEXT:    ldr b1, [sp, #96]
+; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x10]
+; CHECK-SD-NEXT:    add x9, sp, #104
+; CHECK-SD-NEXT:    add x10, sp, #176
+; CHECK-SD-NEXT:    mov v0.b[1], w1
+; CHECK-SD-NEXT:    ld1 { v3.b }[1], [x11]
+; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x9]
+; CHECK-SD-NEXT:    add x11, sp, #16
+; CHECK-SD-NEXT:    add x9, sp, #112
+; CHECK-SD-NEXT:    add x13, sp, #184
+; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x10]
+; CHECK-SD-NEXT:    add x12, sp, #120
+; CHECK-SD-NEXT:    add x14, sp, #32
+; CHECK-SD-NEXT:    ld1 { v3.b }[2], [x11]
+; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x9]
+; CHECK-SD-NEXT:    ldr b5, [sp, #64]
+; CHECK-SD-NEXT:    mov v0.b[2], w2
+; CHECK-SD-NEXT:    ldr b4, [sp, #224]
+; CHECK-SD-NEXT:    add x11, sp, #128
+; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x13]
+; CHECK-SD-NEXT:    add x13, sp, #24
+; CHECK-SD-NEXT:    add x10, sp, #136
+; CHECK-SD-NEXT:    ld1 { v3.b }[3], [x13]
+; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x12]
+; CHECK-SD-NEXT:    add x12, sp, #192
+; CHECK-SD-NEXT:    add x13, sp, #200
+; CHECK-SD-NEXT:    add x15, sp, #80
+; CHECK-SD-NEXT:    add x9, sp, #144
+; CHECK-SD-NEXT:    mov v0.b[3], w3
+; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x12]
+; CHECK-SD-NEXT:    add x12, sp, #232
+; CHECK-SD-NEXT:    ld1 { v3.b }[4], [x14]
+; CHECK-SD-NEXT:    add x14, sp, #72
+; CHECK-SD-NEXT:    ld1 { v4.b }[1], [x12]
+; CHECK-SD-NEXT:    ld1 { v5.b }[1], [x14]
+; CHECK-SD-NEXT:    add x14, sp, #40
+; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x11]
+; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x13]
+; CHECK-SD-NEXT:    add x12, sp, #208
+; CHECK-SD-NEXT:    add x13, sp, #48
+; CHECK-SD-NEXT:    mov v0.b[4], w4
+; CHECK-SD-NEXT:    ld1 { v3.b }[5], [x14]
+; CHECK-SD-NEXT:    add x14, sp, #240
+; CHECK-SD-NEXT:    ld1 { v4.b }[2], [x14]
+; CHECK-SD-NEXT:    ld1 { v5.b }[2], [x15]
+; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x10]
+; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x12]
+; CHECK-SD-NEXT:    add x11, sp, #216
+; CHECK-SD-NEXT:    add x10, sp, #56
+; CHECK-SD-NEXT:    ld1 { v3.b }[6], [x13]
+; CHECK-SD-NEXT:    add x12, sp, #248
+; CHECK-SD-NEXT:    add x13, sp, #88
+; CHECK-SD-NEXT:    mov v0.b[5], w5
+; CHECK-SD-NEXT:    ld1 { v4.b }[3], [x12]
+; CHECK-SD-NEXT:    ld1 { v5.b }[3], [x13]
+; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x9]
+; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x11]
+; CHECK-SD-NEXT:    add x9, sp, #152
+; CHECK-SD-NEXT:    ld1 { v3.b }[7], [x10]
+; CHECK-SD-NEXT:    uaddl v4.8h, v5.8b, v4.8b
+; CHECK-SD-NEXT:    mov v0.b[6], w6
+; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x9]
+; CHECK-SD-NEXT:    uaddl v2.8h, v3.8b, v2.8b
+; CHECK-SD-NEXT:    ushll v3.4s, v4.4h, #0
+; CHECK-SD-NEXT:    mov v0.b[7], w7
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT:    stp q1, q3, [x8, #48]
+; CHECK-SD-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    stp q3, q2, [x8, #16]
+; CHECK-SD-NEXT:    str q0, [x8]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v20:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr s0, [sp]
+; CHECK-GI-NEXT:    ldr s4, [sp, #8]
+; CHECK-GI-NEXT:    fmov s1, w0
+; CHECK-GI-NEXT:    ldr s2, [sp, #32]
+; CHECK-GI-NEXT:    ldr s19, [sp, #40]
+; CHECK-GI-NEXT:    fmov s3, w4
+; CHECK-GI-NEXT:    mov v0.s[1], v4.s[0]
+; CHECK-GI-NEXT:    ldr s16, [sp, #96]
+; CHECK-GI-NEXT:    ldr s22, [sp, #104]
+; CHECK-GI-NEXT:    mov v2.s[1], v19.s[0]
+; CHECK-GI-NEXT:    ldr s19, [sp, #128]
+; CHECK-GI-NEXT:    ldr s23, [sp, #136]
+; CHECK-GI-NEXT:    ldr s18, [sp, #16]
+; CHECK-GI-NEXT:    mov v1.s[1], w1
+; CHECK-GI-NEXT:    mov v3.s[1], w5
+; CHECK-GI-NEXT:    mov v16.s[1], v22.s[0]
+; CHECK-GI-NEXT:    mov v19.s[1], v23.s[0]
+; CHECK-GI-NEXT:    ldr s4, [sp, #64]
+; CHECK-GI-NEXT:    ldr s21, [sp, #72]
+; CHECK-GI-NEXT:    mov v0.s[2], v18.s[0]
+; CHECK-GI-NEXT:    ldr s18, [sp, #160]
+; CHECK-GI-NEXT:    ldr s24, [sp, #168]
+; CHECK-GI-NEXT:    ldr s20, [sp, #192]
+; CHECK-GI-NEXT:    ldr s25, [sp, #200]
+; CHECK-GI-NEXT:    ldr s22, [sp, #224]
+; CHECK-GI-NEXT:    ldr s27, [sp, #232]
+; CHECK-GI-NEXT:    ldr s23, [sp, #112]
+; CHECK-GI-NEXT:    ldr s26, [sp, #144]
+; CHECK-GI-NEXT:    mov v18.s[1], v24.s[0]
+; CHECK-GI-NEXT:    mov v20.s[1], v25.s[0]
+; CHECK-GI-NEXT:    mov v4.s[1], v21.s[0]
+; CHECK-GI-NEXT:    mov v22.s[1], v27.s[0]
+; CHECK-GI-NEXT:    mov v1.s[2], w2
+; CHECK-GI-NEXT:    ldr s17, [sp, #48]
+; CHECK-GI-NEXT:    mov v3.s[2], w6
+; CHECK-GI-NEXT:    mov v16.s[2], v23.s[0]
+; CHECK-GI-NEXT:    mov v19.s[2], v26.s[0]
+; CHECK-GI-NEXT:    ldr s7, [sp, #80]
+; CHECK-GI-NEXT:    ldr s21, [sp, #176]
+; CHECK-GI-NEXT:    ldr s24, [sp, #208]
+; CHECK-GI-NEXT:    ldr s25, [sp, #240]
+; CHECK-GI-NEXT:    mov v2.s[2], v17.s[0]
+; CHECK-GI-NEXT:    ldr s17, [sp, #120]
+; CHECK-GI-NEXT:    ldr s23, [sp, #152]
+; CHECK-GI-NEXT:    ldr s5, [sp, #24]
+; CHECK-GI-NEXT:    mov v18.s[2], v21.s[0]
+; CHECK-GI-NEXT:    mov v20.s[2], v24.s[0]
+; CHECK-GI-NEXT:    mov v4.s[2], v7.s[0]
+; CHECK-GI-NEXT:    mov v22.s[2], v25.s[0]
+; CHECK-GI-NEXT:    mov v1.s[3], w3
+; CHECK-GI-NEXT:    mov v3.s[3], w7
+; CHECK-GI-NEXT:    mov v16.s[3], v17.s[0]
+; CHECK-GI-NEXT:    mov v19.s[3], v23.s[0]
+; CHECK-GI-NEXT:    ldr s6, [sp, #56]
+; CHECK-GI-NEXT:    ldr s7, [sp, #184]
+; CHECK-GI-NEXT:    ldr s21, [sp, #216]
+; CHECK-GI-NEXT:    ldr s17, [sp, #88]
+; CHECK-GI-NEXT:    mov v0.s[3], v5.s[0]
+; CHECK-GI-NEXT:    ldr s5, [sp, #248]
+; CHECK-GI-NEXT:    mov v2.s[3], v6.s[0]
+; CHECK-GI-NEXT:    mov v18.s[3], v7.s[0]
+; CHECK-GI-NEXT:    mov v20.s[3], v21.s[0]
+; CHECK-GI-NEXT:    mov v4.s[3], v17.s[0]
+; CHECK-GI-NEXT:    mov v22.s[3], v5.s[0]
+; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    movi v3.2d, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    uzp1 v5.8h, v16.8h, v19.8h
+; CHECK-GI-NEXT:    dup v6.4s, w8
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    uzp1 v2.8h, v18.8h, v20.8h
+; CHECK-GI-NEXT:    uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-GI-NEXT:    uzp1 v6.8h, v22.8h, v6.8h
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    and v5.16b, v5.16b, v3.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    add v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT:    and v4.16b, v4.16b, v3.16b
+; CHECK-GI-NEXT:    and v3.16b, v6.16b, v3.16b
+; CHECK-GI-NEXT:    add v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    add v3.4h, v4.4h, v3.4h
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    stp q2, q1, [x8]
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    stp q4, q0, [x8, #32]
+; CHECK-GI-NEXT:    str q2, [x8, #64]
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <20 x i8> %s0 to <20 x i32>
   %s1s = zext <20 x i8> %s1 to <20 x i32>
@@ -534,98 +1364,165 @@ entry:
 }
 
 define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) {
-; CHECK-LABEL: i12:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w23, -48
-; CHECK-NEXT:    ldr w13, [sp, #112]
-; CHECK-NEXT:    ldr w14, [sp, #144]
-; CHECK-NEXT:    fmov s2, w4
-; CHECK-NEXT:    ldr w17, [sp, #176]
-; CHECK-NEXT:    ldr w19, [sp, #208]
-; CHECK-NEXT:    fmov s3, w0
-; CHECK-NEXT:    ldr w20, [sp, #80]
-; CHECK-NEXT:    ldr w21, [sp, #48]
-; CHECK-NEXT:    fmov s5, w13
-; CHECK-NEXT:    fmov s4, w19
-; CHECK-NEXT:    fmov s6, w17
-; CHECK-NEXT:    fmov s7, w14
-; CHECK-NEXT:    fmov s0, w20
-; CHECK-NEXT:    fmov s1, w21
-; CHECK-NEXT:    ldr w10, [sp, #120]
-; CHECK-NEXT:    ldr w11, [sp, #152]
-; CHECK-NEXT:    ldr w12, [sp, #184]
-; CHECK-NEXT:    ldr w15, [sp, #216]
-; CHECK-NEXT:    ldr w22, [sp, #88]
-; CHECK-NEXT:    ldr w23, [sp, #56]
-; CHECK-NEXT:    mov v2.h[1], w5
-; CHECK-NEXT:    mov v3.h[1], w1
-; CHECK-NEXT:    mov v5.h[1], w10
-; CHECK-NEXT:    mov v4.h[1], w15
-; CHECK-NEXT:    mov v0.h[1], w22
-; CHECK-NEXT:    mov v1.h[1], w23
-; CHECK-NEXT:    mov v6.h[1], w12
-; CHECK-NEXT:    mov v7.h[1], w11
-; CHECK-NEXT:    ldr w8, [sp, #128]
-; CHECK-NEXT:    ldr w9, [sp, #160]
-; CHECK-NEXT:    ldr w16, [sp, #64]
-; CHECK-NEXT:    ldr w18, [sp, #96]
-; CHECK-NEXT:    ldr w10, [sp, #192]
-; CHECK-NEXT:    ldr w11, [sp, #224]
-; CHECK-NEXT:    mov v2.h[2], w6
-; CHECK-NEXT:    mov v3.h[2], w2
-; CHECK-NEXT:    mov v0.h[2], w18
-; CHECK-NEXT:    mov v1.h[2], w16
-; CHECK-NEXT:    mov v5.h[2], w8
-; CHECK-NEXT:    mov v4.h[2], w11
-; CHECK-NEXT:    mov v6.h[2], w10
-; CHECK-NEXT:    mov v7.h[2], w9
-; CHECK-NEXT:    ldr w12, [sp, #72]
-; CHECK-NEXT:    ldr w13, [sp, #104]
-; CHECK-NEXT:    ldr w8, [sp, #136]
-; CHECK-NEXT:    ldr w9, [sp, #168]
-; CHECK-NEXT:    ldr w10, [sp, #200]
-; CHECK-NEXT:    ldr w11, [sp, #232]
-; CHECK-NEXT:    mov v0.h[3], w13
-; CHECK-NEXT:    mov v1.h[3], w12
-; CHECK-NEXT:    mov v2.h[3], w7
-; CHECK-NEXT:    mov v3.h[3], w3
-; CHECK-NEXT:    mov v5.h[3], w8
-; CHECK-NEXT:    mov v4.h[3], w11
-; CHECK-NEXT:    mov v6.h[3], w10
-; CHECK-NEXT:    mov v7.h[3], w9
-; CHECK-NEXT:    movi v16.4s, #15, msl #8
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-NEXT:    ushll v5.4s, v5.4h, #0
-; CHECK-NEXT:    ushll v4.4s, v4.4h, #0
-; CHECK-NEXT:    ushll v6.4s, v6.4h, #0
-; CHECK-NEXT:    ushll v7.4s, v7.4h, #0
-; CHECK-NEXT:    and v17.16b, v0.16b, v16.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v1.16b, v2.16b, v16.16b
-; CHECK-NEXT:    and v0.16b, v3.16b, v16.16b
-; CHECK-NEXT:    and v2.16b, v5.16b, v16.16b
-; CHECK-NEXT:    and v3.16b, v4.16b, v16.16b
-; CHECK-NEXT:    and v4.16b, v6.16b, v16.16b
-; CHECK-NEXT:    and v5.16b, v7.16b, v16.16b
-; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    add v3.4s, v17.4s, v3.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    add v2.4s, v18.4s, v4.4s
-; CHECK-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: i12:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-SD-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-SD-NEXT:    .cfi_offset w19, -8
+; CHECK-SD-NEXT:    .cfi_offset w20, -16
+; CHECK-SD-NEXT:    .cfi_offset w21, -24
+; CHECK-SD-NEXT:    .cfi_offset w22, -32
+; CHECK-SD-NEXT:    .cfi_offset w23, -48
+; CHECK-SD-NEXT:    ldr w13, [sp, #112]
+; CHECK-SD-NEXT:    ldr w14, [sp, #144]
+; CHECK-SD-NEXT:    fmov s2, w4
+; CHECK-SD-NEXT:    ldr w17, [sp, #176]
+; CHECK-SD-NEXT:    ldr w19, [sp, #208]
+; CHECK-SD-NEXT:    fmov s3, w0
+; CHECK-SD-NEXT:    ldr w20, [sp, #80]
+; CHECK-SD-NEXT:    ldr w21, [sp, #48]
+; CHECK-SD-NEXT:    fmov s5, w13
+; CHECK-SD-NEXT:    fmov s4, w19
+; CHECK-SD-NEXT:    fmov s6, w17
+; CHECK-SD-NEXT:    fmov s7, w14
+; CHECK-SD-NEXT:    fmov s0, w20
+; CHECK-SD-NEXT:    fmov s1, w21
+; CHECK-SD-NEXT:    ldr w10, [sp, #120]
+; CHECK-SD-NEXT:    ldr w11, [sp, #152]
+; CHECK-SD-NEXT:    ldr w12, [sp, #184]
+; CHECK-SD-NEXT:    ldr w15, [sp, #216]
+; CHECK-SD-NEXT:    ldr w22, [sp, #88]
+; CHECK-SD-NEXT:    ldr w23, [sp, #56]
+; CHECK-SD-NEXT:    mov v2.h[1], w5
+; CHECK-SD-NEXT:    mov v3.h[1], w1
+; CHECK-SD-NEXT:    mov v5.h[1], w10
+; CHECK-SD-NEXT:    mov v4.h[1], w15
+; CHECK-SD-NEXT:    mov v0.h[1], w22
+; CHECK-SD-NEXT:    mov v1.h[1], w23
+; CHECK-SD-NEXT:    mov v6.h[1], w12
+; CHECK-SD-NEXT:    mov v7.h[1], w11
+; CHECK-SD-NEXT:    ldr w8, [sp, #128]
+; CHECK-SD-NEXT:    ldr w9, [sp, #160]
+; CHECK-SD-NEXT:    ldr w16, [sp, #64]
+; CHECK-SD-NEXT:    ldr w18, [sp, #96]
+; CHECK-SD-NEXT:    ldr w10, [sp, #192]
+; CHECK-SD-NEXT:    ldr w11, [sp, #224]
+; CHECK-SD-NEXT:    mov v2.h[2], w6
+; CHECK-SD-NEXT:    mov v3.h[2], w2
+; CHECK-SD-NEXT:    mov v0.h[2], w18
+; CHECK-SD-NEXT:    mov v1.h[2], w16
+; CHECK-SD-NEXT:    mov v5.h[2], w8
+; CHECK-SD-NEXT:    mov v4.h[2], w11
+; CHECK-SD-NEXT:    mov v6.h[2], w10
+; CHECK-SD-NEXT:    mov v7.h[2], w9
+; CHECK-SD-NEXT:    ldr w12, [sp, #72]
+; CHECK-SD-NEXT:    ldr w13, [sp, #104]
+; CHECK-SD-NEXT:    ldr w8, [sp, #136]
+; CHECK-SD-NEXT:    ldr w9, [sp, #168]
+; CHECK-SD-NEXT:    ldr w10, [sp, #200]
+; CHECK-SD-NEXT:    ldr w11, [sp, #232]
+; CHECK-SD-NEXT:    mov v0.h[3], w13
+; CHECK-SD-NEXT:    mov v1.h[3], w12
+; CHECK-SD-NEXT:    mov v2.h[3], w7
+; CHECK-SD-NEXT:    mov v3.h[3], w3
+; CHECK-SD-NEXT:    mov v5.h[3], w8
+; CHECK-SD-NEXT:    mov v4.h[3], w11
+; CHECK-SD-NEXT:    mov v6.h[3], w10
+; CHECK-SD-NEXT:    mov v7.h[3], w9
+; CHECK-SD-NEXT:    movi v16.4s, #15, msl #8
+; CHECK-SD-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT:    ushll v5.4s, v5.4h, #0
+; CHECK-SD-NEXT:    ushll v4.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ushll v6.4s, v6.4h, #0
+; CHECK-SD-NEXT:    ushll v7.4s, v7.4h, #0
+; CHECK-SD-NEXT:    and v17.16b, v0.16b, v16.16b
+; CHECK-SD-NEXT:    and v18.16b, v1.16b, v16.16b
+; CHECK-SD-NEXT:    and v1.16b, v2.16b, v16.16b
+; CHECK-SD-NEXT:    and v0.16b, v3.16b, v16.16b
+; CHECK-SD-NEXT:    and v2.16b, v5.16b, v16.16b
+; CHECK-SD-NEXT:    and v3.16b, v4.16b, v16.16b
+; CHECK-SD-NEXT:    and v4.16b, v6.16b, v16.16b
+; CHECK-SD-NEXT:    and v5.16b, v7.16b, v16.16b
+; CHECK-SD-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    add v3.4s, v17.4s, v3.4s
+; CHECK-SD-NEXT:    add v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT:    add v2.4s, v18.4s, v4.4s
+; CHECK-SD-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: i12:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s1, w0
+; CHECK-GI-NEXT:    fmov s4, w4
+; CHECK-GI-NEXT:    ldr s0, [sp]
+; CHECK-GI-NEXT:    ldr s20, [sp, #8]
+; CHECK-GI-NEXT:    ldr s2, [sp, #32]
+; CHECK-GI-NEXT:    ldr s21, [sp, #40]
+; CHECK-GI-NEXT:    ldr s16, [sp, #64]
+; CHECK-GI-NEXT:    ldr s22, [sp, #72]
+; CHECK-GI-NEXT:    ldr s17, [sp, #96]
+; CHECK-GI-NEXT:    ldr s23, [sp, #104]
+; CHECK-GI-NEXT:    mov v1.s[1], w1
+; CHECK-GI-NEXT:    mov v4.s[1], w5
+; CHECK-GI-NEXT:    ldr s18, [sp, #128]
+; CHECK-GI-NEXT:    ldr s24, [sp, #136]
+; CHECK-GI-NEXT:    mov v0.s[1], v20.s[0]
+; CHECK-GI-NEXT:    ldr s19, [sp, #160]
+; CHECK-GI-NEXT:    ldr s25, [sp, #168]
+; CHECK-GI-NEXT:    mov v2.s[1], v21.s[0]
+; CHECK-GI-NEXT:    mov v16.s[1], v22.s[0]
+; CHECK-GI-NEXT:    mov v17.s[1], v23.s[0]
+; CHECK-GI-NEXT:    mov v18.s[1], v24.s[0]
+; CHECK-GI-NEXT:    mov v19.s[1], v25.s[0]
+; CHECK-GI-NEXT:    ldr s6, [sp, #16]
+; CHECK-GI-NEXT:    ldr s7, [sp, #48]
+; CHECK-GI-NEXT:    ldr s20, [sp, #80]
+; CHECK-GI-NEXT:    ldr s21, [sp, #112]
+; CHECK-GI-NEXT:    ldr s22, [sp, #144]
+; CHECK-GI-NEXT:    ldr s23, [sp, #176]
+; CHECK-GI-NEXT:    mov v1.s[2], w2
+; CHECK-GI-NEXT:    mov v4.s[2], w6
+; CHECK-GI-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-GI-NEXT:    mov v2.s[2], v7.s[0]
+; CHECK-GI-NEXT:    mov v16.s[2], v20.s[0]
+; CHECK-GI-NEXT:    mov v17.s[2], v21.s[0]
+; CHECK-GI-NEXT:    mov v18.s[2], v22.s[0]
+; CHECK-GI-NEXT:    mov v19.s[2], v23.s[0]
+; CHECK-GI-NEXT:    ldr s3, [sp, #24]
+; CHECK-GI-NEXT:    ldr s5, [sp, #56]
+; CHECK-GI-NEXT:    ldr s6, [sp, #88]
+; CHECK-GI-NEXT:    ldr s7, [sp, #120]
+; CHECK-GI-NEXT:    ldr s20, [sp, #152]
+; CHECK-GI-NEXT:    ldr s21, [sp, #184]
+; CHECK-GI-NEXT:    mov v1.s[3], w3
+; CHECK-GI-NEXT:    mov v4.s[3], w7
+; CHECK-GI-NEXT:    movi v22.4s, #15, msl #8
+; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
+; CHECK-GI-NEXT:    mov v2.s[3], v5.s[0]
+; CHECK-GI-NEXT:    mov v16.s[3], v6.s[0]
+; CHECK-GI-NEXT:    mov v17.s[3], v7.s[0]
+; CHECK-GI-NEXT:    mov v18.s[3], v20.s[0]
+; CHECK-GI-NEXT:    mov v19.s[3], v21.s[0]
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v22.16b
+; CHECK-GI-NEXT:    and v3.16b, v4.16b, v22.16b
+; CHECK-GI-NEXT:    and v4.16b, v0.16b, v22.16b
+; CHECK-GI-NEXT:    and v5.16b, v2.16b, v22.16b
+; CHECK-GI-NEXT:    and v0.16b, v16.16b, v22.16b
+; CHECK-GI-NEXT:    and v2.16b, v17.16b, v22.16b
+; CHECK-GI-NEXT:    and v6.16b, v18.16b, v22.16b
+; CHECK-GI-NEXT:    and v7.16b, v19.16b, v22.16b
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    add v2.4s, v4.4s, v6.4s
+; CHECK-GI-NEXT:    add v3.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i12> %s0 to <16 x i32>
   %s1s = zext <16 x i12> %s1 to <16 x i32>
@@ -634,15 +1531,25 @@ entry:
 }
 
 define <16 x i32> @sub_zz(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: sub_zz:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    usubl v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    usubl2 v4.8h, v0.16b, v1.16b
-; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sub_zz:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    usubl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sub_zz:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    usubl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
   %s1s = zext <16 x i8> %s1 to <16 x i32>
@@ -651,15 +1558,25 @@ entry:
 }
 
 define <16 x i32> @sub_ss(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: sub_ss:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ssubl v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ssubl2 v4.8h, v0.16b, v1.16b
-; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sub_ss:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ssubl2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sub_ss:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ssubl2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i32>
   %s1s = sext <16 x i8> %s1 to <16 x i32>
@@ -668,17 +1585,33 @@ entry:
 }
 
 define <16 x i32> @sub_zs(<16 x i8> %s0, <16 x i8> %s1) {
-; CHECK-LABEL: sub_zs:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    ssubw v2.8h, v2.8h, v1.8b
-; CHECK-NEXT:    ssubw2 v4.8h, v0.8h, v1.16b
-; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
-; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: sub_zs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT:    ssubw v2.8h, v2.8h, v1.8b
+; CHECK-SD-NEXT:    ssubw2 v4.8h, v0.8h, v1.16b
+; CHECK-SD-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v4.4h, #0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: sub_zs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v4.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v6.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ssubw v0.4s, v1.4s, v3.4h
+; CHECK-GI-NEXT:    ssubw2 v1.4s, v2.4s, v3.8h
+; CHECK-GI-NEXT:    ssubw v2.4s, v5.4s, v4.4h
+; CHECK-GI-NEXT:    ssubw2 v3.4s, v6.4s, v4.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
   %s1s = sext <16 x i8> %s1 to <16 x i32>
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
index 866b27b81d885..c91de8f3a0a47 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -8,36 +8,39 @@
 define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-LABEL: run_test:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #192
-; CHECK-NEXT:    .cfi_def_cfa_offset 192
+; CHECK-NEXT:    sub sp, sp, #208
+; CHECK-NEXT:    .cfi_def_cfa_offset 208
 ; CHECK-NEXT:    stp d15, d14, [sp, #96] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #112] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #128] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #144] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #176] // 16-byte Folded Spill
+; CHECK-NEXT:    str x23, [sp, #160] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #176] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #192] // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w21, -24
 ; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset b8, -40
-; CHECK-NEXT:    .cfi_offset b9, -48
-; CHECK-NEXT:    .cfi_offset b10, -56
-; CHECK-NEXT:    .cfi_offset b11, -64
-; CHECK-NEXT:    .cfi_offset b12, -72
-; CHECK-NEXT:    .cfi_offset b13, -80
-; CHECK-NEXT:    .cfi_offset b14, -88
-; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    .cfi_offset w23, -48
+; CHECK-NEXT:    .cfi_offset b8, -56
+; CHECK-NEXT:    .cfi_offset b9, -64
+; CHECK-NEXT:    .cfi_offset b10, -72
+; CHECK-NEXT:    .cfi_offset b11, -80
+; CHECK-NEXT:    .cfi_offset b12, -88
+; CHECK-NEXT:    .cfi_offset b13, -96
+; CHECK-NEXT:    .cfi_offset b14, -104
+; CHECK-NEXT:    .cfi_offset b15, -112
 ; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    // implicit-def: $q1
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    adrp x10, B+48
-; CHECK-NEXT:    add x10, x10, :lo12:B+48
-; CHECK-NEXT:    adrp x11, A
-; CHECK-NEXT:    add x11, x11, :lo12:A
+; CHECK-NEXT:    adrp x9, B+48
+; CHECK-NEXT:    add x9, x9, :lo12:B+48
+; CHECK-NEXT:    adrp x10, A
+; CHECK-NEXT:    add x10, x10, :lo12:A
+; CHECK-NEXT:    mov x11, xzr
 ; CHECK-NEXT:    // kill: killed $q1
 ; CHECK-NEXT:    // implicit-def: $q1
+; CHECK-NEXT:    mov x12, xzr
 ; CHECK-NEXT:    // implicit-def: $q0
 ; CHECK-NEXT:    // implicit-def: $q3
 ; CHECK-NEXT:    // implicit-def: $q4
@@ -69,103 +72,102 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    // kill: killed $q1
 ; CHECK-NEXT:  .LBB0_1: // %for.cond1.preheader
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str q14, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    ldr q14, [x8]
-; CHECK-NEXT:    mov x12, xzr
-; CHECK-NEXT:    ldr x14, [x12]
 ; CHECK-NEXT:    stp q29, q15, [sp] // 32-byte Folded Spill
-; CHECK-NEXT:    add x19, x11, x8
-; CHECK-NEXT:    fmov x15, d14
-; CHECK-NEXT:    mov x16, v14.d[1]
-; CHECK-NEXT:    ldr q15, [x12]
-; CHECK-NEXT:    ldr q14, [x10], #64
+; CHECK-NEXT:    ldr q15, [x8]
+; CHECK-NEXT:    ldr x15, [x8]
+; CHECK-NEXT:    str q14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    add x20, x10, x11
 ; CHECK-NEXT:    mov v8.16b, v28.16b
-; CHECK-NEXT:    fmov x13, d15
-; CHECK-NEXT:    mov x18, v15.d[1]
+; CHECK-NEXT:    fmov x2, d15
+; CHECK-NEXT:    mov x17, v15.d[1]
+; CHECK-NEXT:    ldr q14, [x8]
 ; CHECK-NEXT:    mov v28.16b, v24.16b
-; CHECK-NEXT:    mul x17, x15, x14
-; CHECK-NEXT:    mov x12, v14.d[1]
-; CHECK-NEXT:    fmov x4, d14
 ; CHECK-NEXT:    mov v24.16b, v20.16b
 ; CHECK-NEXT:    mov v20.16b, v17.16b
+; CHECK-NEXT:    fmov x13, d14
+; CHECK-NEXT:    mov x16, v14.d[1]
 ; CHECK-NEXT:    mov v17.16b, v5.16b
-; CHECK-NEXT:    mul x1, x16, x14
+; CHECK-NEXT:    mul x3, x2, x15
+; CHECK-NEXT:    ldr q14, [x9], #64
 ; CHECK-NEXT:    ldr q5, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x5, [x8]
-; CHECK-NEXT:    ldr x19, [x19, #128]
+; CHECK-NEXT:    ldr x6, [x8]
+; CHECK-NEXT:    ldr x20, [x20, #128]
+; CHECK-NEXT:    mul x1, x17, x15
+; CHECK-NEXT:    mov x14, v14.d[1]
+; CHECK-NEXT:    fmov x5, d14
 ; CHECK-NEXT:    mov v29.16b, v21.16b
 ; CHECK-NEXT:    mov v21.16b, v0.16b
-; CHECK-NEXT:    mul x0, x13, x14
 ; CHECK-NEXT:    mov v25.16b, v6.16b
+; CHECK-NEXT:    mul x18, x13, x15
 ; CHECK-NEXT:    mov v6.16b, v2.16b
-; CHECK-NEXT:    fmov d15, x17
 ; CHECK-NEXT:    mov v26.16b, v22.16b
+; CHECK-NEXT:    fmov d15, x3
 ; CHECK-NEXT:    mov v22.16b, v18.16b
-; CHECK-NEXT:    mul x2, x18, x14
 ; CHECK-NEXT:    mov v18.16b, v7.16b
+; CHECK-NEXT:    mul x0, x16, x15
 ; CHECK-NEXT:    mov v7.16b, v3.16b
 ; CHECK-NEXT:    mov v16.16b, v4.16b
-; CHECK-NEXT:    add x8, x8, #8
-; CHECK-NEXT:    add x9, x9, #1
+; CHECK-NEXT:    add x11, x11, #8
+; CHECK-NEXT:    add x12, x12, #1
 ; CHECK-NEXT:    mov v15.d[1], x1
-; CHECK-NEXT:    mul x3, x12, x14
-; CHECK-NEXT:    cmp x8, #64
-; CHECK-NEXT:    fmov d14, x0
-; CHECK-NEXT:    mul x14, x4, x14
+; CHECK-NEXT:    mul x4, x14, x15
+; CHECK-NEXT:    cmp x11, #64
+; CHECK-NEXT:    fmov d14, x18
+; CHECK-NEXT:    mul x15, x5, x15
 ; CHECK-NEXT:    add v5.2d, v5.2d, v15.2d
-; CHECK-NEXT:    mul x20, x15, x5
-; CHECK-NEXT:    mov v14.d[1], x2
-; CHECK-NEXT:    mul x15, x15, x19
-; CHECK-NEXT:    fmov d0, x14
+; CHECK-NEXT:    mul x21, x2, x6
+; CHECK-NEXT:    mov v14.d[1], x0
+; CHECK-NEXT:    mul x2, x2, x20
+; CHECK-NEXT:    fmov d0, x15
 ; CHECK-NEXT:    str q5, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    ldr q5, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    mul x21, x13, x19
+; CHECK-NEXT:    mul x22, x13, x20
 ; CHECK-NEXT:    add v5.2d, v5.2d, v14.2d
-; CHECK-NEXT:    fmov d3, x20
-; CHECK-NEXT:    mul x7, x16, x5
-; CHECK-NEXT:    mov v0.d[1], x3
-; CHECK-NEXT:    fmov d1, x15
-; CHECK-NEXT:    mul x16, x16, x19
+; CHECK-NEXT:    fmov d3, x21
+; CHECK-NEXT:    mul x19, x17, x6
+; CHECK-NEXT:    mov v0.d[1], x4
+; CHECK-NEXT:    fmov d1, x2
+; CHECK-NEXT:    mul x17, x17, x20
 ; CHECK-NEXT:    str q5, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    add v5.2d, v13.2d, v14.2d
-; CHECK-NEXT:    fmov d2, x21
+; CHECK-NEXT:    fmov d2, x22
 ; CHECK-NEXT:    ldr q13, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    mul x6, x18, x5
+; CHECK-NEXT:    mul x7, x16, x6
 ; CHECK-NEXT:    ldp q15, q14, [sp, #16] // 32-byte Folded Reload
-; CHECK-NEXT:    mov v3.d[1], x7
+; CHECK-NEXT:    mov v3.d[1], x19
 ; CHECK-NEXT:    add v13.2d, v13.2d, v0.2d
-; CHECK-NEXT:    mul x18, x18, x19
-; CHECK-NEXT:    mov v1.d[1], x16
-; CHECK-NEXT:    mul x22, x4, x19
+; CHECK-NEXT:    mul x16, x16, x20
+; CHECK-NEXT:    mov v1.d[1], x17
+; CHECK-NEXT:    mul x23, x5, x20
 ; CHECK-NEXT:    str q13, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov v13.16b, v5.16b
 ; CHECK-NEXT:    mov v5.16b, v17.16b
 ; CHECK-NEXT:    mov v17.16b, v20.16b
 ; CHECK-NEXT:    mov v20.16b, v24.16b
-; CHECK-NEXT:    mul x13, x13, x5
+; CHECK-NEXT:    mul x13, x13, x6
 ; CHECK-NEXT:    mov v24.16b, v28.16b
 ; CHECK-NEXT:    add v11.2d, v11.2d, v3.2d
-; CHECK-NEXT:    mov v2.d[1], x18
+; CHECK-NEXT:    mov v2.d[1], x16
 ; CHECK-NEXT:    add v15.2d, v15.2d, v1.2d
 ; CHECK-NEXT:    add v27.2d, v27.2d, v3.2d
-; CHECK-NEXT:    mul x17, x12, x19
+; CHECK-NEXT:    mul x18, x14, x20
 ; CHECK-NEXT:    add v23.2d, v23.2d, v3.2d
 ; CHECK-NEXT:    add v19.2d, v19.2d, v3.2d
-; CHECK-NEXT:    fmov d4, x22
+; CHECK-NEXT:    fmov d4, x23
 ; CHECK-NEXT:    add v10.2d, v10.2d, v3.2d
-; CHECK-NEXT:    mul x14, x4, x5
+; CHECK-NEXT:    mul x15, x5, x6
 ; CHECK-NEXT:    fmov d0, x13
 ; CHECK-NEXT:    add v14.2d, v14.2d, v2.2d
 ; CHECK-NEXT:    add v2.2d, v6.2d, v3.2d
-; CHECK-NEXT:    mul x12, x12, x5
+; CHECK-NEXT:    mul x14, x14, x6
 ; CHECK-NEXT:    mov v3.16b, v7.16b
 ; CHECK-NEXT:    mov v7.16b, v18.16b
-; CHECK-NEXT:    mov v4.d[1], x17
+; CHECK-NEXT:    mov v4.d[1], x18
 ; CHECK-NEXT:    mov v18.16b, v22.16b
-; CHECK-NEXT:    mov v0.d[1], x6
-; CHECK-NEXT:    fmov d1, x14
+; CHECK-NEXT:    mov v0.d[1], x7
+; CHECK-NEXT:    fmov d1, x15
 ; CHECK-NEXT:    add v28.2d, v8.2d, v4.2d
-; CHECK-NEXT:    mov v1.d[1], x12
+; CHECK-NEXT:    mov v1.d[1], x14
 ; CHECK-NEXT:    add v31.2d, v31.2d, v0.2d
 ; CHECK-NEXT:    add v30.2d, v30.2d, v0.2d
 ; CHECK-NEXT:    add v12.2d, v12.2d, v0.2d
@@ -192,11 +194,12 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    adrp x8, C
 ; CHECK-NEXT:    add x8, x8, :lo12:C
 ; CHECK-NEXT:    stp q11, q30, [x8, #80]
-; CHECK-NEXT:    ldp x20, x19, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
 ; CHECK-NEXT:    str q1, [x8]
 ; CHECK-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x23, [sp, #160] // 8-byte Folded Reload
 ; CHECK-NEXT:    stp q15, q14, [x8, #144]
-; CHECK-NEXT:    ldp x22, x21, [sp, #160] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
 ; CHECK-NEXT:    stp q1, q13, [x8, #16]
 ; CHECK-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    stp q28, q12, [x8, #176]
@@ -216,12 +219,13 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    stp q5, q4, [x8, #432]
 ; CHECK-NEXT:    stp q2, q3, [x8, #464]
 ; CHECK-NEXT:    str q0, [x8, #496]
-; CHECK-NEXT:    add sp, sp, #192
+; CHECK-NEXT:    add sp, sp, #208
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w19
 ; CHECK-NEXT:    .cfi_restore w20
 ; CHECK-NEXT:    .cfi_restore w21
 ; CHECK-NEXT:    .cfi_restore w22
+; CHECK-NEXT:    .cfi_restore w23
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
diff --git a/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll b/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll
index 95abbb6979be8..af664549a472a 100644
--- a/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll
+++ b/llvm/test/CodeGen/AArch64/relaxed-fp-atomics.ll
@@ -91,4 +91,94 @@ define void @atomic_store_relaxed_f64(ptr %p, i32 %off32, i64 %off64, double %va
   ret void
 }
 
+define half @atomic_load_relaxed_f16(ptr %p, i32 %off32, i64 %off64) #0 {
+; CHECK-LABEL: atomic_load_relaxed_f16:
+  %ptr_unsigned = getelementptr half, ptr %p, i32 4095
+  %val_unsigned = load atomic half, ptr %ptr_unsigned monotonic, align 4
+; CHECK: ldrh {{w[0-9]+}}, [x0, #8190]
+
+  %ptr_regoff = getelementptr half, ptr %p, i32 %off32
+  %val_regoff = load atomic half, ptr %ptr_regoff unordered, align 4
+  %tot1 = fadd half %val_unsigned, %val_regoff
+; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+  %ptr_regoff64 = getelementptr half, ptr %p, i64 %off64
+  %val_regoff64 = load atomic half, ptr %ptr_regoff64 monotonic, align 4
+  %tot2 = fadd half %tot1, %val_regoff64
+; CHECK: ldrh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+  %ptr_unscaled = getelementptr half, ptr %p, i32 -64
+  %val_unscaled = load atomic half, ptr %ptr_unscaled unordered, align 4
+  %tot3 = fadd half %tot2, %val_unscaled
+; CHECK: ldurh {{w[0-9]+}}, [x0, #-128]
+
+  ret half %tot3
+}
+
+define bfloat @atomic_load_relaxed_bf16(ptr %p, i32 %off32, i64 %off64) #0 {
+; CHECK-LABEL: atomic_load_relaxed_bf16:
+  %ptr_unsigned = getelementptr bfloat, ptr %p, i32 4095
+  %val_unsigned = load atomic bfloat, ptr %ptr_unsigned monotonic, align 4
+; CHECK: ldrh {{w[0-9]+}}, [x0, #8190]
+
+  %ptr_regoff = getelementptr bfloat, ptr %p, i32 %off32
+  %val_regoff = load atomic bfloat, ptr %ptr_regoff unordered, align 4
+  %tot1 = fadd bfloat %val_unsigned, %val_regoff
+; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+  %ptr_regoff64 = getelementptr bfloat, ptr %p, i64 %off64
+  %val_regoff64 = load atomic bfloat, ptr %ptr_regoff64 monotonic, align 4
+  %tot2 = fadd bfloat %tot1, %val_regoff64
+; CHECK: ldrh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+  %ptr_unscaled = getelementptr bfloat, ptr %p, i32 -64
+  %val_unscaled = load atomic bfloat, ptr %ptr_unscaled unordered, align 4
+  %tot3 = fadd bfloat %tot2, %val_unscaled
+; CHECK: ldurh {{w[0-9]+}}, [x0, #-128]
+
+  ret bfloat %tot3
+}
+
+define void @atomic_store_relaxed_f16(ptr %p, i32 %off32, i64 %off64, half %val) #0 {
+; CHECK-LABEL: atomic_store_relaxed_f16:
+  %ptr_unsigned = getelementptr half, ptr %p, i32 4095
+  store atomic half %val, ptr %ptr_unsigned monotonic, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, #8190]
+
+  %ptr_regoff = getelementptr half, ptr %p, i32 %off32
+  store atomic half %val, ptr %ptr_regoff unordered, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+  %ptr_regoff64 = getelementptr half, ptr %p, i64 %off64
+  store atomic half %val, ptr %ptr_regoff64 monotonic, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+  %ptr_unscaled = getelementptr half, ptr %p, i32 -64
+  store atomic half %val, ptr %ptr_unscaled unordered, align 4
+; CHECK: sturh {{w[0-9]+}}, [x0, #-128]
+
+  ret void
+}
+
+define void @atomic_store_relaxed_bf16(ptr %p, i32 %off32, i64 %off64, bfloat %val) #0 {
+; CHECK-LABEL: atomic_store_relaxed_bf16:
+  %ptr_unsigned = getelementptr bfloat, ptr %p, i32 4095
+  store atomic bfloat %val, ptr %ptr_unsigned monotonic, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, #8190]
+
+  %ptr_regoff = getelementptr bfloat, ptr %p, i32 %off32
+  store atomic bfloat %val, ptr %ptr_regoff unordered, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+  %ptr_regoff64 = getelementptr bfloat, ptr %p, i64 %off64
+  store atomic bfloat %val, ptr %ptr_regoff64 monotonic, align 4
+; CHECK: strh {{w[0-9]+}}, [x0, x2, lsl #1]
+
+  %ptr_unscaled = getelementptr bfloat, ptr %p, i32 -64
+  store atomic bfloat %val, ptr %ptr_unscaled unordered, align 4
+; CHECK: sturh {{w[0-9]+}}, [x0, #-128]
+
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
index bc1db878cbd31..611cdcda157e2 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs < %s | FileCheck %s
 
 ;
 ; FCVT
@@ -139,6 +139,15 @@ define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale
   ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %res
 }
 
+define {<vscale x 4 x float>, <vscale x 4 x float>}  @multi_vector_cvt_widen_x2_f16(<vscale x 8 x half> %zn0) {
+; CHECK-LABEL: multi_vector_cvt_widen_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvt { z0.s, z1.s }, z0.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvt.widen.x2.nxv4f32(<vscale x 8 x half> %zn0)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
 declare <vscale x 8 x half> @llvm.aarch64.sve.fcvt.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.bfcvt.x2(<vscale x 4 x float>, <vscale x 4 x float>)
 declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32(<vscale x 4 x float>,<vscale x 4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll
new file mode 100644
index 0000000000000..30dc7cbfaea6c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvtl.ll
@@ -0,0 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme-f16f16 -verify-machineinstrs < %s | FileCheck %s
+
+define {<vscale x 4 x float>, <vscale x 4 x float>}  @multi_vector_cvtl_widen_x2_f16(<vscale x 8 x half> %zn0) {
+; CHECK-LABEL: multi_vector_cvtl_widen_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl { z0.s, z1.s }, z0.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.fcvtl.widen.x2.nxv4f32(<vscale x 8 x half> %zn0)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
+}
diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll
new file mode 100644
index 0000000000000..ba77637580f4c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-zero.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define  void @test_svzero_za64_vg1x2(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg1x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg1x2_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg1x2_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 7, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  %slice.max = add i32 %slice, 7
+  tail call void @llvm.aarch64.sme.zero.za64.vg1x2(i32 %slice.max)
+  ret void
+}
+
+define  void @test_svzero_za64_vg1x4(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg1x4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg1x4_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg1x4_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 1, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  %slice.min = add i32 %slice, 1
+  tail call void @llvm.aarch64.sme.zero.za64.vg1x4(i32 %slice.min)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x1(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:1]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x1_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x1_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 6:7]
+; CHECK-NEXT:    ret
+entry:
+  %slice.max = add i32 %slice, 6
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x1(i32 %slice.max)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x2(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:1, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x2_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x2_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 2:3, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  %slice.max = add i32 %slice, 2
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x2(i32 %slice.max)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x4(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:1, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg2x4_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg2x4_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add w8, w0, #1
+; CHECK-NEXT:    zero za.d[w8, 0:1, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  %slice.min = add i32 %slice, 1
+  tail call void @llvm.aarch64.sme.zero.za64.vg2x4(i32 %slice.min)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x1(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:3]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x1_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x1_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 4:7]
+; CHECK-NEXT:    ret
+entry:
+  %slice.max = add i32 %slice, 4
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x1(i32 %slice.max)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x2(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:3, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x2_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x2_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:3, vgx2]
+; CHECK-NEXT:    ret
+entry:
+  %slice.max = add i32 %slice, 0
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x2(i32 %slice.max)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x4(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    zero za.d[w8, 0:3, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 %slice)
+  ret void
+}
+
+define  void @test_svzero_za64_vg4x4_offset(i32  %slice)  #0 {
+; CHECK-LABEL: test_svzero_za64_vg4x4_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add w8, w0, #1
+; CHECK-NEXT:    zero za.d[w8, 0:3, vgx4]
+; CHECK-NEXT:    ret
+entry:
+  %slice.min = add i32 %slice, 1
+  tail call void @llvm.aarch64.sme.zero.za64.vg4x4(i32 %slice.min)
+  ret void
+}
+
+attributes #0 = { nounwind "target-features" = "+sme2p1"}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index e843537c10a33..4cdb175f55c9c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -14,6 +15,19 @@ define <4 x i8> @vls_sve_and_4xi8(<4 x i8> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_4xi8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i8> %b, <i8 0, i8 255, i8 0, i8 255>
  ret <4 x i8> %c
 }
@@ -27,6 +41,25 @@ define <8 x i8> @vls_sve_and_8xi8(<8 x i8> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_8xi8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i8> %b, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <8 x i8> %c
 }
@@ -40,6 +73,37 @@ define <16 x i8> @vls_sve_and_16xi8(<16 x i8> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_16xi8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
  %c = and <16 x i8> %b, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <16 x i8> %c
 }
@@ -56,6 +120,61 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_32xi8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #62]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
  %b = and <32 x i8> %ap, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255,
                          i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
  ret <32 x i8> %b
@@ -73,6 +192,15 @@ define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_2xi16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i16> %b, <i16 0, i16 65535>
  ret <2 x i16> %c
 }
@@ -86,6 +214,19 @@ define <4 x i16> @vls_sve_and_4xi16(<4 x i16> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_4xi16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535>
  ret <4 x i16> %c
 }
@@ -99,6 +240,25 @@ define <8 x i16> @vls_sve_and_8xi16(<8 x i16> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_8xi16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535>
  ret <8 x i16> %c
 }
@@ -115,6 +275,37 @@ define <16 x i16> @vls_sve_and_16xi16(<16 x i16> %b) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_16xi16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #60]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
  %c = and <16 x i16> %b, <i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535, i16 0, i16 65535>
  ret <16 x i16> %c
 }
@@ -128,6 +319,15 @@ define <2 x i32> @vls_sve_and_2xi32(<2 x i32> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_2xi32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i32> %b, <i32 0, i32 4294967295>
  ret <2 x i32> %c
 }
@@ -141,6 +341,17 @@ define <4 x i32> @vls_sve_and_4xi32(<4 x i32> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_4xi32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i32> %b, <i32 0, i32 4294967295, i32 0, i32 4294967295>
  ret <4 x i32> %c
 }
@@ -157,6 +368,21 @@ define <8 x i32> @vls_sve_and_8xi32(<8 x i32> %b) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_8xi32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
  %c = and <8 x i32> %b, <i32 0, i32 4294967295, i32 0, i32 4294967295, i32 0, i32 4294967295, i32 0, i32 4294967295>
  ret <8 x i32> %c
 }
@@ -170,6 +396,15 @@ define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_2xi64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp xzr, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
  %c = and <2 x i64> %b, <i64 0, i64 18446744073709551615>
  ret <2 x i64> %c
 }
@@ -185,6 +420,20 @@ define <4 x i64> @vls_sve_and_4xi64(<4 x i64> %b) nounwind {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: vls_sve_and_4xi64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp xzr, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp xzr, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
  %c = and <4 x i64> %b, <i64 0, i64 18446744073709551615, i64 0, i64 18446744073709551615>
  ret <4 x i64> %c
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index aa42d5c2a8c13..f920efeb4892d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,6 +19,30 @@ define <4 x i8> @ctlz_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    sub z0.h, z0.h, #8 // =0x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w9
+; NONEON-NOSVE-NEXT:    clz w10, w10
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    sub w9, w9, #24
+; NONEON-NOSVE-NEXT:    sub w10, w10, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w11
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -30,6 +55,46 @@ define <8 x i8> @ctlz_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -42,6 +107,78 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -55,6 +192,144 @@ define void @ctlz_v32i8(ptr %a) {
 ; CHECK-NEXT:    clz z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -71,6 +346,21 @@ define <2 x i16> @ctlz_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    sub z0.s, z0.s, #16 // =0x10
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w9
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    sub w9, w9, #16
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -83,6 +373,30 @@ define <4 x i16> @ctlz_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -95,6 +409,46 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -108,6 +462,80 @@ define void @ctlz_v16i16(ptr %a) {
 ; CHECK-NEXT:    clz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    sub w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -122,6 +550,19 @@ define <2 x i32> @ctlz_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -134,6 +575,24 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -147,6 +606,36 @@ define void @ctlz_v8i32(ptr %a) {
 ; CHECK-NEXT:    clz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -161,6 +650,17 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -173,6 +673,19 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -186,6 +699,26 @@ define void @ctlz_v4i64(ptr %a) {
 ; CHECK-NEXT:    clz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctlz_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
@@ -205,6 +738,41 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #66]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    fmov d1, x9
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d2, x10
+; NONEON-NOSVE-NEXT:    fmov d3, x8
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    cnt v1.8b, v1.8b
+; NONEON-NOSVE-NEXT:    cnt v2.8b, v2.8b
+; NONEON-NOSVE-NEXT:    cnt v3.8b, v3.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h1, v1.8b
+; NONEON-NOSVE-NEXT:    uaddlv h2, v2.8b
+; NONEON-NOSVE-NEXT:    uaddlv h3, v3.8b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -217,6 +785,71 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    str d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #135]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #133]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #131]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #129]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #143]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #141]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #139]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #137]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -229,6 +862,130 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #271]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #270]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #269]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #267]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #266]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #265]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #263]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #262]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #261]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #259]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #258]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #257]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #287]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #285]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #283]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -242,6 +999,244 @@ define void @ctpop_v32i8(ptr %a) {
 ; CHECK-NEXT:    cnt z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #576
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 592
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #543]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #542]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #541]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #540]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #539]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #538]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #537]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #536]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #535]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #534]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #533]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #532]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #531]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #530]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #529]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #528]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #527]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #526]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #496]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #525]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #480]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #524]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #464]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #523]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #522]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #521]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #416]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #519]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #518]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #517]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #516]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #515]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #514]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #513]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #575]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #574]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #573]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #572]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #571]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #570]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #569]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #568]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #567]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #566]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #565]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #564]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #563]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #562]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #561]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #559]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #558]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #557]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #556]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #555]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #554]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #400]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #553]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #384]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #368]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #551]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #549]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #548]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #547]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #546]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #545]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #544]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #576
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -257,6 +1252,27 @@ define <2 x i16> @ctpop_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    fmov d1, x9
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    cnt v1.8b, v1.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h1, v1.8b
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -269,6 +1285,43 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -281,6 +1334,71 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -294,6 +1412,132 @@ define void @ctpop_v16i16(ptr %a) {
 ; CHECK-NEXT:    cnt z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #270]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #266]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #262]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #258]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #318]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #314]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #310]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #306]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #302]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #290]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -308,6 +1552,28 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -320,6 +1586,41 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -333,6 +1634,69 @@ define void @ctpop_v8i32(ptr %a) {
 ; CHECK-NEXT:    cnt z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -347,6 +1711,19 @@ define <1 x i64> @ctpop_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -359,6 +1736,27 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -372,6 +1770,41 @@ define void @ctpop_v4i64(ptr %a) {
 ; CHECK-NEXT:    cnt z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ctpop_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #120]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #112]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cnt v0.8b, v0.8b
+; NONEON-NOSVE-NEXT:    str x8, [sp, #104]
+; NONEON-NOSVE-NEXT:    uaddlv h0, v0.8b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
@@ -392,6 +1825,34 @@ define <4 x i8> @cttz_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -405,6 +1866,54 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -418,6 +1927,94 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -433,6 +2030,176 @@ define void @cttz_v32i8(ptr %a) {
 ; CHECK-NEXT:    clz z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x100
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -449,6 +2216,23 @@ define <2 x i16> @cttz_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -462,6 +2246,34 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -475,6 +2287,54 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -490,6 +2350,96 @@ define void @cttz_v16i16(ptr %a) {
 ; CHECK-NEXT:    clz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x10000
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -505,6 +2455,21 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -518,6 +2483,28 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -533,6 +2520,44 @@ define void @cttz_v8i32(ptr %a) {
 ; CHECK-NEXT:    clz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    clz w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -548,6 +2573,18 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -561,6 +2598,21 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -576,6 +2628,30 @@ define void @cttz_v4i64(ptr %a) {
 ; CHECK-NEXT:    clz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: cttz_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    clz x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 260ad16581f13..41065b3602003 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -11,6 +12,18 @@ define void @bitcast_v4i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrb w11, [x0, #3]
+; NONEON-NOSVE-NEXT:    strb w11, [x1, #3]
+; NONEON-NOSVE-NEXT:    strb w10, [x1, #2]
+; NONEON-NOSVE-NEXT:    strb w9, [x1, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i8>, ptr %a
   %cast = bitcast <4 x i8> %load to <4 x i8>
   store volatile <4 x i8> %cast, ptr %b
@@ -23,6 +36,12 @@ define void @bitcast_v8i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <8 x i8>, ptr %a
   %cast = bitcast <8 x i8> %load to <8 x i8>
   store volatile <8 x i8> %cast, ptr %b
@@ -35,6 +54,12 @@ define void @bitcast_v16i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <16 x i8>, ptr %a
   %cast = bitcast <16 x i8> %load to <16 x i8>
   store volatile <16 x i8> %cast, ptr %b
@@ -49,6 +74,14 @@ define void @bitcast_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <32 x i8>, ptr %a
   %cast = bitcast <32 x i8> %load to <32 x i8>
   store volatile <32 x i8> %cast, ptr %b
@@ -72,6 +105,26 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i16>, ptr %a
   %cast = bitcast <2 x i16> %load to <2 x half>
   store volatile <2 x half> %cast, ptr %b
@@ -84,6 +137,12 @@ define void @bitcast_v4i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i16>, ptr %a
   %cast = bitcast <4 x i16> %load to <4 x half>
   store volatile <4 x half> %cast, ptr %b
@@ -96,6 +155,12 @@ define void @bitcast_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <8 x i16>, ptr %a
   %cast = bitcast <8 x i16> %load to <8 x half>
   store volatile <8 x half> %cast, ptr %b
@@ -110,6 +175,14 @@ define void @bitcast_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <16 x i16>, ptr %a
   %cast = bitcast <16 x i16> %load to <16 x half>
   store volatile <16 x half> %cast, ptr %b
@@ -122,6 +195,12 @@ define void @bitcast_v2i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i32>, ptr %a
   %cast = bitcast <2 x i32> %load to <2 x float>
   store volatile <2 x float> %cast, ptr %b
@@ -134,6 +213,12 @@ define void @bitcast_v4i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i32>, ptr %a
   %cast = bitcast <4 x i32> %load to <4 x float>
   store volatile <4 x float> %cast, ptr %b
@@ -148,6 +233,14 @@ define void @bitcast_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <8 x i32>, ptr %a
   %cast = bitcast <8 x i32> %load to <8 x float>
   store volatile <8 x float> %cast, ptr %b
@@ -160,6 +253,12 @@ define void @bitcast_v1i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <1 x i64>, ptr %a
   %cast = bitcast <1 x i64> %load to <1 x double>
   store volatile <1 x double> %cast, ptr %b
@@ -172,6 +271,12 @@ define void @bitcast_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <2 x i64>, ptr %a
   %cast = bitcast <2 x i64> %load to <2 x double>
   store volatile <2 x double> %cast, ptr %b
@@ -186,6 +291,14 @@ define void @bitcast_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x1, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitcast_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %load = load volatile <4 x i64>, ptr %a
   %cast = bitcast <4 x i64> %load to <4 x double>
   store volatile <4 x double> %cast, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
index 9a07bd8bd5ac9..b908dd61f2401 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64"
 
@@ -30,6 +31,43 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fixed_bitselect_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x2]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    neg w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    bsl v0.16b, v3.16b, v5.16b
+; NONEON-NOSVE-NEXT:    bsl v1.16b, v2.16b, v4.16b
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %pre_cond = load <8 x i32>, ptr %pre_cond_ptr
   %left = load <8 x i32>, ptr %left_ptr
   %right = load <8 x i32>, ptr %right_ptr
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index aec434b4819d7..318a9cf7d738b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -10,6 +11,12 @@ define void @build_vector_7_inc1_v4i1(ptr %a) {
 ; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_7_inc1_v4i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    strb w8, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i1> <i1 true, i1 false, i1 true, i1 false>, ptr %a, align 1
   ret void
 }
@@ -23,6 +30,15 @@ define void @build_vector_7_inc1_v32i8(ptr %a) {
 ; CHECK-NEXT:    add z1.b, z1.b, #23 // =0x17
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_7_inc1_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI1_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI1_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI1_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <32 x i8> <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38>, ptr %a, align 1
   ret void
 }
@@ -35,6 +51,15 @@ define void @build_vector_0_inc2_v16i16(ptr %a) {
 ; CHECK-NEXT:    add z0.h, z0.h, #16 // =0x10
 ; CHECK-NEXT:    str q0, [x0, #16]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_0_inc2_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI2_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI2_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <16 x i16> <i16 0, i16 2, i16 4, i16 6, i16 8, i16 10, i16 12, i16 14, i16 16, i16 18, i16 20, i16 22, i16 24, i16 26, i16 28, i16 30>, ptr %a, align 2
   ret void
 }
@@ -48,6 +73,15 @@ define void @build_vector_0_dec3_v8i32(ptr %a) {
 ; CHECK-NEXT:    add z1.s, z0.s, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI3_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI3_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x i32> <i32 0, i32 -3, i32 -6, i32 -9, i32 -12, i32 -15, i32 -18, i32 -21>, ptr %a, align 4
   ret void
 }
@@ -64,6 +98,15 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) {
 ; CHECK-NEXT:    add z0.d, z0.d, z2.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_minus2_dec32_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI4_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI4_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI4_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI4_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> <i64 -2, i64 -34, i64 -66, i64 -98>, ptr %a, align 8
   ret void
 }
@@ -76,6 +119,15 @@ define void @build_vector_no_stride_v4i64(ptr %a) {
 ; CHECK-NEXT:    index z1.d, #0, #4
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_no_stride_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI5_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI5_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI5_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> <i64 0, i64 4, i64 1, i64 8>, ptr %a, align 8
   ret void
 }
@@ -89,6 +141,15 @@ define void @build_vector_0_inc2_v16f16(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI6_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_0_inc2_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI6_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI6_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI6_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <16 x half> <half 0.0, half 2.0, half 4.0, half 6.0, half 8.0, half 10.0, half 12.0, half 14.0, half 16.0, half 18.0, half 20.0, half 22.0, half 24.0, half 26.0, half 28.0, half 30.0>, ptr %a, align 2
   ret void
 }
@@ -103,6 +164,15 @@ define void @build_vector_0_dec3_v8f32(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI7_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_0_dec3_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI7_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI7_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x float> <float 0.0, float -3.0, float -6.0, float -9.0, float -12.0, float -15.0, float -18.0, float -21.0>, ptr %a, align 4
   ret void
 }
@@ -117,6 +187,15 @@ define void @build_vector_minus2_dec32_v4f64(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI8_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_minus2_dec32_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI8_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI8_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x double> <double -2.0, double -34.0, double -66.0, double -98.0>, ptr %a, align 8
   ret void
 }
@@ -131,6 +210,15 @@ define void @build_vector_no_stride_v4f64(ptr %a) {
 ; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI9_1]
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: build_vector_no_stride_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI9_1
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI9_0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI9_1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x double> <double 0.0, double 4.0, double 1.0, double 8.0>, ptr %a, align 8
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 82e75d6efda35..a845c3cbdc2b6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -40,6 +41,31 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2)  {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %res
 }
@@ -53,6 +79,13 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %res
@@ -65,6 +98,13 @@ define void @concat_v32i8(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %op2 = load <16 x i8>, ptr %b
   %res = shufflevector <16 x i8> %op1, <16 x i8> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -83,6 +123,14 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v64i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = shufflevector <32 x i8> %op1, <32 x i8> %op2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -121,6 +169,21 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2)  {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i16> %res
 }
@@ -135,6 +198,13 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
@@ -146,6 +216,13 @@ define void @concat_v16i16(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
   %res = shufflevector <8 x i16> %op1, <8 x i16> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -162,6 +239,14 @@ define void @concat_v32i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = shufflevector <16 x i16> %op1, <16 x i16> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -185,6 +270,18 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2)  {
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i32> %res
 }
@@ -199,6 +296,13 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2)  {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
 }
@@ -210,6 +314,13 @@ define void @concat_v8i32(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %op2 = load <4 x i32>, ptr %b
   %res = shufflevector <4 x i32> %op1, <4 x i32> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -225,6 +336,14 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = shufflevector <8 x i32> %op1, <8 x i32> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -247,6 +366,13 @@ define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2)  {
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %res
 }
@@ -258,6 +384,13 @@ define void @concat_v4i64(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %res = shufflevector <2 x i64> %op1, <2 x i64> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -273,6 +406,14 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = shufflevector <4 x i64> %op1, <4 x i64> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -300,6 +441,18 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2)  {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x half> %res
 }
@@ -313,6 +466,13 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2)  {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x half> %res
 }
@@ -324,6 +484,13 @@ define void @concat_v16f16(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %op2 = load <8 x half>, ptr %b
   %res = shufflevector <8 x half> %op1, <8 x half> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -340,6 +507,14 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v32f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = shufflevector <16 x half> %op1, <16 x half> %op2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -363,6 +538,18 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2)  {
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %res
 }
@@ -377,6 +564,13 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2)  {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %res
 }
@@ -388,6 +582,13 @@ define void @concat_v8f32(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %op2 = load <4 x float>, ptr %b
   %res = shufflevector <4 x float> %op1, <4 x float> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -403,6 +604,14 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = shufflevector <8 x float> %op1, <8 x float> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -425,6 +634,13 @@ define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2)  {
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %res
 }
@@ -436,6 +652,13 @@ define void @concat_v4f64(ptr %a, ptr %b, ptr %c)  {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %op2 = load <2 x double>, ptr %b
   %res = shufflevector <2 x double> %op1, <2 x double> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -451,6 +674,14 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
 ; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = shufflevector <4 x double> %op1, <4 x double> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -468,6 +699,12 @@ define void @concat_v32i8_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v32i8_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %res = shufflevector <16 x i8> %op1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
@@ -483,6 +720,12 @@ define void @concat_v16i16_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16i16_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = shufflevector <8 x i16> %op1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -496,6 +739,12 @@ define void @concat_v8i32_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i32_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = shufflevector <4 x i32> %op1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i32> %res, ptr %b
@@ -508,6 +757,12 @@ define void @concat_v4i64_undef(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4i64_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %res = shufflevector <2 x i64> %op1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i64> %res, ptr %b
@@ -524,6 +779,16 @@ define void @concat_v32i8_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v32i8_4op:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -541,6 +806,16 @@ define void @concat_v16i16_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v16i16_4op:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %res = shufflevector <8 x i16> %shuffle, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -555,6 +830,16 @@ define void @concat_v8i32_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v8i32_4op:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res = shufflevector <4 x i32> %shuffle, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -568,6 +853,16 @@ define void @concat_v4i64_4op(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: concat_v4i64_4op:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x i64>, ptr %a
   %shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
   %res = shufflevector <2 x i64> %shuffle, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 040e5861e9810..2cdd4374a56c5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -11,6 +12,32 @@ define <8 x i16> @load_zext_v8i8i16(ptr %ap)  {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_zext_v8i8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i8>, ptr %ap
   %val = zext <8 x i8> %a to <8 x i16>
   ret <8 x i16> %val
@@ -23,6 +50,22 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap)  {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_zext_v4i16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i16>, ptr %ap
   %val = zext <4 x i16> %a to <4 x i32>
   ret <4 x i32> %val
@@ -35,6 +78,19 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) {
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_zext_v2i32i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i32>, ptr %ap
   %val = zext <2 x i32> %a to <2 x i64>
   ret <2 x i64> %val
@@ -54,6 +110,20 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) {
 ; CHECK-NEXT:    mov x7, xzr
 ; CHECK-NEXT:    fmov x4, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_zext_v2i64i256:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x0, x4, [sp], #16
+; NONEON-NOSVE-NEXT:    mov x1, xzr
+; NONEON-NOSVE-NEXT:    mov x2, xzr
+; NONEON-NOSVE-NEXT:    mov x3, xzr
+; NONEON-NOSVE-NEXT:    mov x5, xzr
+; NONEON-NOSVE-NEXT:    mov x6, xzr
+; NONEON-NOSVE-NEXT:    mov x7, xzr
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = zext <2 x i64> %a to <2 x i256>
   ret <2 x i256> %val
@@ -75,6 +145,79 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap)  {
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $z2
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $z3
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_sext_v16i8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i8>, ptr %ap
   %val = sext <16 x i8> %a to <16 x i32>
   ret <16 x i32> %val
@@ -90,6 +233,29 @@ define <8 x i32> @load_sext_v8i16i32(ptr %ap)  {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_sext_v8i16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
   %val = sext <8 x i16> %a to <8 x i32>
   ret <8 x i32> %val
@@ -121,6 +287,36 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
 ; CHECK-NEXT:    stp x12, x12, [x8, #112]
 ; CHECK-NEXT:    stp x11, x12, [x8, #96]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_sext_v4i32i256:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp x11, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp x12, x13, [sp, #80]
+; NONEON-NOSVE-NEXT:    asr x10, x9, #63
+; NONEON-NOSVE-NEXT:    asr x14, x11, #63
+; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #112]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [x8, #96]
+; NONEON-NOSVE-NEXT:    asr x9, x13, #63
+; NONEON-NOSVE-NEXT:    asr x10, x12, #63
+; NONEON-NOSVE-NEXT:    stp x14, x14, [x8, #80]
+; NONEON-NOSVE-NEXT:    stp x11, x14, [x8, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x9, [x8, #48]
+; NONEON-NOSVE-NEXT:    stp x13, x9, [x8, #32]
+; NONEON-NOSVE-NEXT:    stp x10, x10, [x8, #16]
+; NONEON-NOSVE-NEXT:    stp x12, x10, [x8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = sext <4 x i32> %a to <4 x i256>
   ret <4 x i256> %val
@@ -154,6 +350,30 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
 ; CHECK-NEXT:    fmov x1, d6
 ; CHECK-NEXT:    fmov x5, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_sext_v2i64i256:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    asr x8, x10, #63
+; NONEON-NOSVE-NEXT:    stp x9, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x10, x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x0, x1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x2, x3, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp x4, x5, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp x6, x7, [sp, #112]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = sext <2 x i64> %a to <2 x i256>
   ret <2 x i256> %val
@@ -187,6 +407,92 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap)  {
 ; CHECK-NEXT:    // kill: def $q6 killed $q6 killed $z6
 ; CHECK-NEXT:    // kill: def $q7 killed $q7 killed $z7
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_zext_v16i16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #316]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #300]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #292]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #284]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #276]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #268]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [sp, #288]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #256]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %ap
   %val = zext <16 x i16> %a to <16 x i64>
   ret <16 x i64> %val
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index 45a804becbc55..b7b34cfa1517c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -27,6 +28,22 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %op, i64 4)
   ret <4 x i1> %ret
 }
@@ -54,6 +71,22 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4)
   ret <4 x i8> %ret
 }
@@ -65,6 +98,14 @@ define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %op, i64 8)
   ret <8 x i8> %ret
 }
@@ -75,6 +116,12 @@ define void @extract_subvector_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %ret = call <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8> %op, i64 16)
   store <16 x i8> %ret, ptr %b
@@ -91,6 +138,15 @@ define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
   ret <2 x i16> %ret
 }
@@ -102,6 +158,14 @@ define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %op, i64 4)
   ret <4 x i16> %ret
 }
@@ -112,6 +176,12 @@ define void @extract_subvector_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %ret = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> %op, i64 8)
   store <8 x i16> %ret, ptr %b
@@ -127,6 +197,16 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1)
   ret <1 x i32> %ret
 }
@@ -138,6 +218,14 @@ define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %op, i64 2)
   ret <2 x i32> %ret
 }
@@ -148,6 +236,12 @@ define void @extract_subvector_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %ret = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %op, i64 4)
   store <4 x i32> %ret, ptr %b
@@ -163,6 +257,14 @@ define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x i64> @llvm.vector.extract.v1i64.v2i64(<2 x i64> %op, i64 1)
   ret <1 x i64> %ret
 }
@@ -173,6 +275,12 @@ define void @extract_subvector_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %ret = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> %op, i64 2)
   store <2 x i64> %ret, ptr %b
@@ -190,6 +298,16 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2)
   ret <2 x half> %ret
 }
@@ -201,6 +319,14 @@ define <4 x half> @extract_subvector_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <4 x half> @llvm.vector.extract.v4f16.v8f16(<8 x half> %op, i64 4)
   ret <4 x half> %ret
 }
@@ -211,6 +337,12 @@ define void @extract_subvector_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %ret = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> %op, i64 8)
   store <8 x half> %ret, ptr %b
@@ -226,6 +358,16 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1)
   ret <1 x float> %ret
 }
@@ -237,6 +379,14 @@ define <2 x float> @extract_subvector_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> %op, i64 2)
   ret <2 x float> %ret
 }
@@ -247,6 +397,12 @@ define void @extract_subvector_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %ret = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> %op, i64 4)
   store <4 x float> %ret, ptr %b
@@ -262,6 +418,14 @@ define <1 x double> @extract_subvector_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = call <1 x double> @llvm.vector.extract.v1f64.v2f64(<2 x double> %op, i64 1)
   ret <1 x double> %ret
 }
@@ -272,6 +436,12 @@ define void @extract_subvector_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extract_subvector_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %ret = call <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double> %op, i64 2)
   store <2 x double> %ret, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
index 9c3b5e14289dc..0a1831a94d8fe 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -15,6 +16,15 @@ define half @extractelement_v2f16(<2 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x half> %op1, i64 1
   ret half %r
 }
@@ -26,6 +36,15 @@ define half @extractelement_v4f16(<4 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, z0.h[3]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <4 x half> %op1, i64 3
   ret half %r
 }
@@ -37,6 +56,14 @@ define half @extractelement_v8f16(<8 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, z0.h[7]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <8 x half> %op1, i64 7
   ret half %r
 }
@@ -48,6 +75,15 @@ define half @extractelement_v16f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, z0.h[7]
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %r = extractelement <16 x half> %op1, i64 15
   ret half %r
@@ -60,6 +96,15 @@ define float @extractelement_v2f32(<2 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x float> %op1, i64 1
   ret float %r
 }
@@ -71,6 +116,14 @@ define float @extractelement_v4f32(<4 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, z0.s[3]
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <4 x float> %op1, i64 3
   ret float %r
 }
@@ -82,6 +135,15 @@ define float @extractelement_v8f32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, z0.s[3]
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %r = extractelement <8 x float> %op1, i64 7
   ret float %r
@@ -91,6 +153,10 @@ define double @extractelement_v1f64(<1 x double> %op1) {
 ; CHECK-LABEL: extractelement_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <1 x double> %op1, i64 0
   ret double %r
 }
@@ -101,6 +167,14 @@ define double @extractelement_v2f64(<2 x double> %op1) {
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %r = extractelement <2 x double> %op1, i64 1
   ret double %r
 }
@@ -112,6 +186,15 @@ define double @extractelement_v4f64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extractelement_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %r = extractelement <4 x double> %op1, i64 3
   ret double %r
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index 21ce689f68e23..a8d01ec7ce0b4 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
@@ -28,6 +29,62 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str d1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x half>, ptr %bp
   %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b)
@@ -54,6 +111,106 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x half>, ptr %bp
   %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %b)
@@ -84,6 +241,195 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
 ; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v16f16_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #126]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #122]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #120]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #118]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #116]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #114]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #108]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #106]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #102]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #100]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #98]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x half>, ptr %ap
   %b = load <16 x half>, ptr %bp
   %r = call <16 x half> @llvm.copysign.v16f16(<16 x half> %a, <16 x half> %b)
@@ -112,6 +458,30 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str d1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ldr d1, [x1]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x float>, ptr %bp
   %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b)
@@ -138,6 +508,41 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x float>, ptr %bp
   %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b)
@@ -168,6 +573,67 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
 ; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v8f32_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x float>, ptr %ap
   %b = load <8 x float>, ptr %bp
   %r = call <8 x float> @llvm.copysign.v8f32(<8 x float> %a, <8 x float> %b)
@@ -196,6 +662,29 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
 ; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load <2 x double>, ptr %bp
   %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b)
@@ -226,6 +715,43 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
 ; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x double>, ptr %bp
   %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b)
@@ -260,6 +786,31 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x double>, ptr %bp
   %tmp0 = fptrunc <2 x double> %b to <2 x float>
@@ -304,6 +855,43 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s2, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x double>, ptr %bp
   %tmp0 = fptrunc <4 x double> %b to <4 x float>
@@ -337,6 +925,31 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load < 2 x float>, ptr %bp
   %tmp0 = fpext <2 x float> %b to <2 x double>
@@ -381,6 +994,45 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z4.d, z4.d, z1.d, z2.d
 ; SVE2-NEXT:    stp q3, q4, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d2, d1, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x float>, ptr %bp
   %tmp0 = fpext <4 x float> %b to <4 x double>
@@ -416,6 +1068,53 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x float>, ptr %bp
   %tmp0 = fptrunc <4 x float> %b to <4 x half>
@@ -471,6 +1170,53 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    str d5, [x0]
 ; SVE2-NEXT:    add sp, sp, #16
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d2, [x0]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst x9, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst x8, #0x8000000000000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x double>, ptr %bp
   %tmp0 = fptrunc <4 x double> %b to <4 x half>
@@ -514,6 +1260,87 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) {
 ; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w9, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0x80000000
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x float>, ptr %bp
   %tmp0 = fptrunc <8 x float> %b to <8 x half>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index b0a82e699939f..e84acfc8504a9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,43 @@ define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -30,6 +68,43 @@ define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -43,6 +118,70 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -58,6 +197,131 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fadd <16 x half> %op1, %op2
@@ -74,6 +338,21 @@ define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -87,6 +366,26 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -102,6 +401,43 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fadd <8 x float> %op1, %op2
@@ -118,6 +454,20 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fadd <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -133,6 +483,31 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fadd <4 x double> %op1, %op2
@@ -153,6 +528,43 @@ define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -166,6 +578,43 @@ define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -179,6 +628,70 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -194,6 +707,131 @@ define void @fdiv_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fdiv z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fdiv <16 x half> %op1, %op2
@@ -210,6 +848,21 @@ define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -223,6 +876,26 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -238,6 +911,43 @@ define void @fdiv_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fdiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fdiv s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fdiv <8 x float> %op1, %op2
@@ -254,6 +964,20 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fdiv <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -269,6 +993,31 @@ define void @fdiv_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fdiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fdiv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fdiv d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fdiv d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fdiv d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fdiv d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fdiv <4 x double> %op1, %op2
@@ -290,6 +1039,52 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
   ret <2 x half> %res
 }
@@ -304,6 +1099,52 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
   ret <4 x half> %res
 }
@@ -318,6 +1159,88 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
   ret <8 x half> %res
 }
@@ -334,6 +1257,165 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #106]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %op3 = load <16 x half>, ptr %c
@@ -352,6 +1434,23 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
   ret <2 x float> %res
 }
@@ -366,6 +1465,30 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
   ret <4 x float> %res
 }
@@ -382,6 +1505,49 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %op3 = load <8 x float>, ptr %c
@@ -400,6 +1566,23 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 ; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
   ret <2 x double> %res
 }
@@ -416,6 +1599,35 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %op3 = load <4 x double>, ptr %c
@@ -437,6 +1649,43 @@ define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -450,6 +1699,43 @@ define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -463,6 +1749,70 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -478,6 +1828,131 @@ define void @fmul_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fmul <16 x half> %op1, %op2
@@ -494,6 +1969,21 @@ define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -507,6 +1997,26 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -522,6 +2032,43 @@ define void @fmul_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmul s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fmul <8 x float> %op1, %op2
@@ -538,6 +2085,20 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fmul <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -553,6 +2114,31 @@ define void @fmul_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmul z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmul_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmul d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmul d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmul d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fmul <4 x double> %op1, %op2
@@ -572,6 +2158,34 @@ define <2 x half> @fneg_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x half> %op
   ret <2 x half> %res
 }
@@ -584,6 +2198,34 @@ define <4 x half> @fneg_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <4 x half> %op
   ret <4 x half> %res
 }
@@ -596,6 +2238,54 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <8 x half> %op
   ret <8 x half> %res
 }
@@ -609,6 +2299,96 @@ define void @fneg_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fneg z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x8000
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = fneg <16 x half> %op
   store <16 x half> %res, ptr %a
@@ -623,6 +2403,19 @@ define <2 x float> @fneg_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x float> %op
   ret <2 x float> %res
 }
@@ -635,6 +2428,24 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <4 x float> %op
   ret <4 x float> %res
 }
@@ -648,6 +2459,36 @@ define void @fneg_v8f32(ptr %a) {
 ; CHECK-NEXT:    fneg z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fneg s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fneg s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = fneg <8 x float> %op
   store <8 x float> %res, ptr %a
@@ -662,6 +2503,19 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fneg d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fneg <2 x double> %op
   ret <2 x double> %res
 }
@@ -675,6 +2529,26 @@ define void @fneg_v4f64(ptr %a) {
 ; CHECK-NEXT:    fneg z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fneg_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fneg d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fneg d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fneg d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = fneg <4 x double> %op
   store <4 x double> %res, ptr %a
@@ -693,6 +2567,34 @@ define <2 x half> @fsqrt_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -705,6 +2607,34 @@ define <4 x half> @fsqrt_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -717,6 +2647,54 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -730,6 +2708,96 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsqrt z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -744,6 +2812,19 @@ define <2 x float> @fsqrt_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -756,6 +2837,24 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -769,6 +2868,36 @@ define void @fsqrt_v8f32(ptr %a) {
 ; CHECK-NEXT:    fsqrt z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fsqrt s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -783,6 +2912,19 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -796,6 +2938,26 @@ define void @fsqrt_v4f64(ptr %a) {
 ; CHECK-NEXT:    fsqrt z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsqrt_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsqrt d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsqrt d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fsqrt d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fsqrt d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -815,6 +2977,43 @@ define <2 x half> @fsub_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x half> %op1, %op2
   ret <2 x half> %res
 }
@@ -828,6 +3027,43 @@ define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <4 x half> %op1, %op2
   ret <4 x half> %res
 }
@@ -841,6 +3077,70 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <8 x half> %op1, %op2
   ret <8 x half> %res
 }
@@ -856,6 +3156,131 @@ define void @fsub_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsub z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fsub <16 x half> %op1, %op2
@@ -872,6 +3297,21 @@ define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x float> %op1, %op2
   ret <2 x float> %res
 }
@@ -885,6 +3325,26 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <4 x float> %op1, %op2
   ret <4 x float> %res
 }
@@ -900,6 +3360,43 @@ define void @fsub_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsub z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fsub s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fsub <8 x float> %op1, %op2
@@ -916,6 +3413,20 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = fsub <2 x double> %op1, %op2
   ret <2 x double> %res
 }
@@ -931,6 +3442,31 @@ define void @fsub_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fsub z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fsub_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fsub d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fsub d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fsub d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fsub d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fsub <4 x double> %op1, %op2
@@ -950,6 +3486,34 @@ define <2 x half> @fabs_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.fabs.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -962,6 +3526,34 @@ define <4 x half> @fabs_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -974,6 +3566,54 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -987,6 +3627,96 @@ define void @fabs_v16f16(ptr %a) {
 ; CHECK-NEXT:    fabs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fmov w8, s0
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7fff
+; NONEON-NOSVE-NEXT:    fmov s0, w8
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -1001,6 +3731,19 @@ define <2 x float> @fabs_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -1013,6 +3756,24 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -1026,6 +3787,36 @@ define void @fabs_v8f32(ptr %a) {
 ; CHECK-NEXT:    fabs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fabs s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fabs s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -1040,6 +3831,19 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -1053,6 +3857,26 @@ define void @fabs_v4f64(ptr %a) {
 ; CHECK-NEXT:    fabs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fabs_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fabs d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fabs d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fabs d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
index cbd0ad66fba76..776b6918923ae 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,6 +20,28 @@ define <2 x i16> @fcmp_oeq_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x half> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i16>
   ret <2 x i16> %sext
@@ -34,6 +57,43 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <4 x half> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
   ret <4 x i16> %sext
@@ -49,6 +109,70 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <8 x half> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
   ret <8 x i16> %sext
@@ -66,6 +190,131 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp oeq <16 x half> %op1, %op2
@@ -84,6 +333,22 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x float> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
   ret <2 x i32> %sext
@@ -99,6 +364,28 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <4 x float> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
   ret <4 x i32> %sext
@@ -116,6 +403,47 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    csetm w9, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %cmp = fcmp oeq <8 x float> %op1, %op2
@@ -132,6 +460,17 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK-NEXT:    mov z0.d, x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcmp d0, d1
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <1 x double> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
   ret <1 x i64> %sext
@@ -147,6 +486,21 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp]
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    csetm x9, eq
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %cmp = fcmp oeq <2 x double> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
   ret <2 x i64> %sext
@@ -164,6 +518,33 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oeq_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    csetm x9, eq
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp]
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    csetm x9, eq
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %cmp = fcmp oeq <4 x double> %op1, %op2
@@ -192,6 +573,147 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ueq_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ueq <16 x half> %op1, %op2
@@ -220,6 +742,147 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_one_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp one <16 x half> %op1, %op2
@@ -244,6 +907,131 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_une_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp une <16 x half> %op1, %op2
@@ -268,6 +1056,131 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ogt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ogt <16 x half> %op1, %op2
@@ -295,6 +1208,131 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ugt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ugt <16 x half> %op1, %op2
@@ -319,6 +1357,131 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_olt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp olt <16 x half> %op1, %op2
@@ -346,6 +1509,131 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ult_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ult <16 x half> %op1, %op2
@@ -370,6 +1658,131 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_oge_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp oge <16 x half> %op1, %op2
@@ -397,6 +1810,131 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_uge_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, pl
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp uge <16 x half> %op1, %op2
@@ -421,6 +1959,131 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ole_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, ls
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ole <16 x half> %op1, %op2
@@ -448,6 +2111,131 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ule_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ule <16 x half> %op1, %op2
@@ -472,6 +2260,131 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_uno_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, vs
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp uno <16 x half> %op1, %op2
@@ -499,6 +2412,131 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ord_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, vc
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp ord <16 x half> %op1, %op2
@@ -523,6 +2561,131 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_eq_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast oeq <16 x half> %op1, %op2
@@ -547,6 +2710,131 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ne_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast one <16 x half> %op1, %op2
@@ -571,6 +2859,131 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_gt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast ogt <16 x half> %op1, %op2
@@ -595,6 +3008,131 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_lt_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast olt <16 x half> %op1, %op2
@@ -619,6 +3157,131 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_ge_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast oge <16 x half> %op1, %op2
@@ -643,6 +3306,131 @@ define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcmp_le_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %cmp = fcmp fast ole <16 x half> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
index 57d072a7bcd68..2c08977320e84 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,32 @@ define void @fp_convert_combine_crash(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fp_convert_combine_crash:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0, #3
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w10, s2, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w11, s0, #3
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s3, [sp]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w12, s1, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s2, #3
+; NONEON-NOSVE-NEXT:    stp w11, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s3, #3
+; NONEON-NOSVE-NEXT:    fcvtzs w10, s0, #3
+; NONEON-NOSVE-NEXT:    stp w8, w12, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w10, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %f = load <8 x float>, ptr %a
   %mul.i = fmul <8 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00,
                                  float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 6a2dc3c718252..9878910763a75 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,20 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f16_to_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fpext <2 x half> %a to <2 x float>
   store <2 x float> %res, ptr %b
   ret void
@@ -31,6 +46,26 @@ define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f16_to_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fpext <4 x half> %a to <4 x float>
   store <4 x float> %res, ptr %b
   ret void
@@ -48,6 +83,37 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v8f16_to_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %res = fpext <8 x half> %a to <8 x float>
   store <8 x float> %res, ptr %b
   ret void
@@ -72,6 +138,61 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) {
 ; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    stp q2, q1, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %res = fpext <16 x half> %a to <16 x float>
   store <16 x float> %res, ptr %b
   ret void
@@ -90,6 +211,24 @@ define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %res = fpext <2 x half> %op1 to <2 x float>
   store <2 x float> %res, ptr %b
@@ -104,6 +243,27 @@ define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fpext <4 x half> %op1 to <4 x float>
   store <4 x float> %res, ptr %b
@@ -121,6 +281,38 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -145,6 +337,62 @@ define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x float>
   store <16 x float> %res, ptr %b
@@ -162,6 +410,18 @@ define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt d0, h0
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v1f16_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x half>, ptr %a
   %res = fpext <1 x half> %op1 to <1 x double>
   store <1 x double> %res, ptr %b
@@ -176,6 +436,30 @@ define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %res = fpext <2 x half> %op1 to <2 x double>
   store <2 x double> %res, ptr %b
@@ -193,6 +477,39 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fpext <4 x half> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -217,6 +534,65 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -258,6 +634,119 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q4, q0, [x1, #32]
 ; CHECK-NEXT:    stp q1, q2, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s1, h0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #164]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #156]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #152]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #148]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #144]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #140]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #136]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldp q4, q3, [sp, #192]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #328]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #188]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #184]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #172]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #168]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp q2, q5, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -275,6 +764,13 @@ define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt d0, s0
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v1f32_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x float>, ptr %a
   %res = fpext <1 x float> %op1 to <1 x double>
   store <1 x double> %res, ptr %b
@@ -289,6 +785,22 @@ define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %res = fpext <2 x float> %op1 to <2 x double>
   store <2 x double> %res, ptr %b
@@ -306,6 +818,28 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z1.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fpext <4 x float> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -330,6 +864,42 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt d1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt d0, s0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fpext <8 x float> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -348,6 +918,25 @@ define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %res = fptrunc <2 x float> %op1 to <2 x half>
   store <2 x half> %res, ptr %b
@@ -362,6 +951,28 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptrunc <4 x float> %op1 to <4 x half>
   store <4 x half> %res, ptr %b
@@ -379,6 +990,40 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z1.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptrunc <8 x float> %op1 to <8 x half>
   store <8 x half> %res, ptr %b
@@ -397,6 +1042,13 @@ define void @fcvt_v1f64_v1f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <1 x double>, ptr %a
   %res = fptrunc <1 x double> %op1 to <1 x half>
   store <1 x half> %res, ptr %b
@@ -411,6 +1063,24 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %res = fptrunc <2 x double> %op1 to <2 x half>
   store <2 x half> %res, ptr %b
@@ -428,6 +1098,28 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z1.d }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x half>
   store <4 x half> %res, ptr %b
@@ -446,6 +1138,12 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
+; NONEON-NOSVE-NEXT:    str s0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %res = fptrunc <1 x double> %op1 to <1 x float>
   store <1 x float> %res, ptr %b
   ret void
@@ -459,6 +1157,20 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) {
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptrunc <2 x double> %op1 to <2 x float>
   store <2 x float> %res, ptr %b
   ret void
@@ -475,6 +1187,26 @@ define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
 ; CHECK-NEXT:    st1w { z1.d }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, d0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x float>
   store <4 x float> %res, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index 153a04f486571..680cb4fb0a791 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,63 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fmul s1, s3, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmul s2, s7, s6
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmul s2, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s2, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <4 x half> %op1, %op2
   %res = fadd contract <4 x half> %mul, %op3
   ret <4 x half> %res
@@ -32,6 +90,111 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h23, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s3, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s22, h22
+; NONEON-NOSVE-NEXT:    fcvt s23, h23
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s20, h20
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h19, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #6]
+; NONEON-NOSVE-NEXT:    fmul s5, s1, s3
+; NONEON-NOSVE-NEXT:    fcvt s21, h21
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fmul s3, s4, s3
+; NONEON-NOSVE-NEXT:    fmul s0, s1, s0
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s23, s22
+; NONEON-NOSVE-NEXT:    fcvt h3, s3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    str h2, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s21, s20
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s19, s18
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s17, s16
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fmul s5, s7, s6
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt h5, s5
+; NONEON-NOSVE-NEXT:    str h2, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s5, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    str h2, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s2, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    str h1, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <8 x half> %op1, %op2
   %res = fadd contract <8 x half> %mul, %op3
   ret <8 x half> %res
@@ -49,6 +212,232 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #208
+; NONEON-NOSVE-NEXT:    stp d15, d14, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp d13, d12, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp d11, d10, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp d9, d8, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 208
+; NONEON-NOSVE-NEXT:    .cfi_offset b8, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset b9, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset b10, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset b11, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset b12, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset b13, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset b14, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset b15, -64
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q18, q19, [x2]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #78]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #96]
+; NONEON-NOSVE-NEXT:    str q18, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h15, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvt s20, h0
+; NONEON-NOSVE-NEXT:    fcvt s21, h1
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    ldr h13, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h14, [sp, #74]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h12, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h10, [sp, #70]
+; NONEON-NOSVE-NEXT:    fmul s30, s21, s20
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h31, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h28, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h29, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h26, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h27, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h23, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt h19, s30
+; NONEON-NOSVE-NEXT:    fcvt s30, h15
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s17, h17
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s7, h7
+; NONEON-NOSVE-NEXT:    fcvt s19, h19
+; NONEON-NOSVE-NEXT:    fmul s0, s0, s30
+; NONEON-NOSVE-NEXT:    fcvt s30, h14
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt s5, h5
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmul s16, s17, s16
+; NONEON-NOSVE-NEXT:    fmul s6, s7, s6
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s18, s19, s18
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s19, h13
+; NONEON-NOSVE-NEXT:    fmul s4, s5, s4
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    ldp d15, d14, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h16, s16
+; NONEON-NOSVE-NEXT:    fcvt h6, s6
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt h4, s4
+; NONEON-NOSVE-NEXT:    fmul s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14] // 2-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt s16, h16
+; NONEON-NOSVE-NEXT:    fcvt s6, h6
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    str h18, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #108]
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fmul s1, s1, s3
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s18
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h11
+; NONEON-NOSVE-NEXT:    fcvt s30, h12
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    ldp d13, d12, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    str h0, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #106]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h9
+; NONEON-NOSVE-NEXT:    fcvt s30, h10
+; NONEON-NOSVE-NEXT:    ldp d11, d10, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h31
+; NONEON-NOSVE-NEXT:    fcvt s30, h8
+; NONEON-NOSVE-NEXT:    ldp d9, d8, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #102]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s30, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h28
+; NONEON-NOSVE-NEXT:    fcvt s28, h29
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #100]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s28, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h26
+; NONEON-NOSVE-NEXT:    fcvt s26, h27
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #98]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s26, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h24
+; NONEON-NOSVE-NEXT:    fcvt s24, h25
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s24, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h22
+; NONEON-NOSVE-NEXT:    fcvt s22, h23
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s22, s19
+; NONEON-NOSVE-NEXT:    fcvt s19, h20
+; NONEON-NOSVE-NEXT:    fcvt s20, h21
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fmul s18, s20, s19
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h18, s18
+; NONEON-NOSVE-NEXT:    str h0, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s18, h18
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s18, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s16, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s6, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s4, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s2, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #208
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %op3 = load <16 x half>, ptr %c
@@ -68,6 +457,23 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <2 x float> %op1, %op2
   %res = fadd contract <2 x float> %mul, %op3
   ret <2 x float> %res
@@ -83,6 +489,30 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <4 x float> %op1, %op2
   %res = fadd contract <4 x float> %mul, %op3
   ret <4 x float> %res
@@ -100,6 +530,49 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fmadd s5, s4, s3, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %op3 = load <8 x float>, ptr %c
@@ -114,6 +587,16 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmadd d0, d0, d1, d2
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmadd d0, d0, d1, d2
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <1 x double> %op1, %op2
   %res = fadd contract <1 x double> %mul, %op3
   ret <1 x double> %res
@@ -129,6 +612,23 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 ; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %mul = fmul contract <2 x double> %op1, %op2
   %res = fadd contract <2 x double> %mul, %op3
   ret <2 x double> %res
@@ -146,6 +646,35 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fma_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x2]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d2, d4, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fmadd d5, d4, d3, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d5, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %op3 = load <4 x double>, ptr %c
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index 6945a6102c055..775cac272cde9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,43 @@ define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -30,6 +68,70 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -45,6 +147,131 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmaxnm z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -61,6 +288,21 @@ define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -74,6 +316,26 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -89,6 +351,43 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmaxnm z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmaxnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -101,6 +400,16 @@ define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmaxnm d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -114,6 +423,20 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -129,6 +452,31 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmaxnm z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxnm_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmaxnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -149,6 +497,43 @@ define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -162,6 +547,70 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -177,6 +626,131 @@ define void @fminnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.minnum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -193,6 +767,21 @@ define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -206,6 +795,26 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -221,6 +830,43 @@ define void @fminnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fminnm s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.minnum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -233,6 +879,16 @@ define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fminnm d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fminnm d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -246,6 +902,20 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -261,6 +931,31 @@ define void @fminnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminnm_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fminnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.minnum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -281,6 +976,43 @@ define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -294,6 +1026,70 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -309,6 +1105,131 @@ define void @fmax_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.maximum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -325,6 +1246,21 @@ define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -338,6 +1274,26 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -353,6 +1309,43 @@ define void @fmax_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmax s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.maximum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -365,6 +1358,16 @@ define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmax d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmax d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -378,6 +1381,20 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -393,6 +1410,31 @@ define void @fmax_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmax_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmax d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.maximum.v4f64(<4 x double> %op1, <4 x double> %op2)
@@ -413,6 +1455,43 @@ define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2)
   ret <4 x half> %res
 }
@@ -426,6 +1505,70 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2)
   ret <8 x half> %res
 }
@@ -441,6 +1584,131 @@ define void @fmin_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = call <16 x half> @llvm.minimum.v16f16(<16 x half> %op1, <16 x half> %op2)
@@ -457,6 +1725,21 @@ define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2)
   ret <2 x float> %res
 }
@@ -470,6 +1753,26 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2)
   ret <4 x float> %res
 }
@@ -485,6 +1788,43 @@ define void @fmin_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fmin s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = call <8 x float> @llvm.minimum.v8f32(<8 x float> %op1, <8 x float> %op2)
@@ -497,6 +1837,16 @@ define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmin d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmin d0, d0, d1
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2)
   ret <1 x double> %res
 }
@@ -510,6 +1860,20 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2)
   ret <2 x double> %res
 }
@@ -525,6 +1889,31 @@ define void @fmin_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmin_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fmin d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = call <4 x double> @llvm.minimum.v4f64(<4 x double> %op1, <4 x double> %op2)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
index e239ff5e35fd3..f081d4ac65b27 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible < %s | FileCheck %s -check-prefix=FA64
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -check-prefix=NO-FA64
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -26,6 +27,34 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 ; NO-FA64-NEXT:    fadd h0, h0, h2
 ; NO-FA64-NEXT:    fadd h0, h0, h1
 ; NO-FA64-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
index 78ae7bb6cf30a..4eaaee7ce5055 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,6 +20,34 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 ; CHECK-NEXT:    fadd h0, h0, h2
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
@@ -43,6 +72,53 @@ define half @fadda_v8f16(half %start, <8 x half> %a) {
 ; CHECK-NEXT:    fadd h0, h0, h2
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
 }
@@ -83,6 +159,97 @@ define half @fadda_v16f16(half %start, ptr %a) {
 ; CHECK-NEXT:    fadd h0, h0, h2
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
   ret half %res
@@ -96,6 +263,17 @@ define float @fadda_v2f32(float %start, <2 x float> %a) {
 ; CHECK-NEXT:    mov z1.s, z1.s[1]
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
 }
@@ -112,6 +290,19 @@ define float @fadda_v4f32(float %start, <4 x float> %a) {
 ; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
 }
@@ -136,6 +327,29 @@ define float @fadda_v8f32(float %start, ptr %a) {
 ; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
   ret float %res
@@ -146,6 +360,11 @@ define double @fadda_v1f64(double %start, <1 x double> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
   ret double %res
 }
@@ -158,6 +377,15 @@ define double @fadda_v2f64(double %start, <2 x double> %a) {
 ; CHECK-NEXT:    mov z1.d, z1.d[1]
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp], #16
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
 }
@@ -174,6 +402,23 @@ define double @fadda_v4f64(double %start, ptr %a) {
 ; CHECK-NEXT:    mov z1.d, z1.d[1]
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadda_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp]
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d2
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
   ret double %res
@@ -191,6 +436,34 @@ define half @faddv_v4f16(half %start, <4 x half> %a) {
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
 }
@@ -203,6 +476,53 @@ define half @faddv_v8f16(half %start, <8 x half> %a) {
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
 }
@@ -216,6 +536,94 @@ define half @faddv_v16f16(half %start, ptr %a) {
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h2, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s3
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt s4, h4
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s3
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt h2, s2
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
   ret half %res
@@ -229,6 +637,17 @@ define float @faddv_v2f32(float %start, <2 x float> %a) {
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
 }
@@ -241,6 +660,18 @@ define float @faddv_v4f32(float %start, <4 x float> %a) {
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s4, s3, [sp], #16
+; NONEON-NOSVE-NEXT:    fadd s3, s4, s3
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fadd s1, s3, s1
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
 }
@@ -254,6 +685,26 @@ define float @faddv_v8f32(float %start, ptr %a) {
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s4, s3, [sp]
+; NONEON-NOSVE-NEXT:    ldp s5, s6, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s7, s16, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s1, s3, s1
+; NONEON-NOSVE-NEXT:    fadd s2, s4, s2
+; NONEON-NOSVE-NEXT:    fadd s3, s7, s5
+; NONEON-NOSVE-NEXT:    fadd s4, s16, s6
+; NONEON-NOSVE-NEXT:    fadd s1, s2, s1
+; NONEON-NOSVE-NEXT:    fadd s2, s3, s4
+; NONEON-NOSVE-NEXT:    fadd s1, s1, s2
+; NONEON-NOSVE-NEXT:    fadd s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
   ret float %res
@@ -264,6 +715,11 @@ define double @faddv_v1f64(double %start, <1 x double> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
   ret double %res
 }
@@ -276,6 +732,15 @@ define double @faddv_v2f64(double %start, <2 x double> %a) {
 ; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp], #16
+; NONEON-NOSVE-NEXT:    fadd d1, d2, d1
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
 }
@@ -289,6 +754,19 @@ define double @faddv_v4f64(double %start, ptr %a) {
 ; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: faddv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d4, d3, [sp], #32
+; NONEON-NOSVE-NEXT:    fadd d1, d3, d1
+; NONEON-NOSVE-NEXT:    fadd d2, d4, d2
+; NONEON-NOSVE-NEXT:    fadd d1, d2, d1
+; NONEON-NOSVE-NEXT:    fadd d0, d0, d1
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
   ret double %res
@@ -306,6 +784,30 @@ define half @fmaxv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -318,6 +820,49 @@ define half @fmaxv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -331,6 +876,90 @@ define half @fmaxv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
   ret half %res
@@ -344,6 +973,16 @@ define float @fmaxv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -356,6 +995,18 @@ define float @fmaxv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -369,6 +1020,25 @@ define float @fmaxv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmaxnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmaxnm s1, s4, s1
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmaxnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
   ret float %res
@@ -378,6 +1048,10 @@ define double @fmaxv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fmaxv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -390,6 +1064,14 @@ define double @fmaxv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -403,6 +1085,18 @@ define double @fmaxv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaxv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d2, d0
+; NONEON-NOSVE-NEXT:    fmaxnm d1, d3, d1
+; NONEON-NOSVE-NEXT:    fmaxnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
   ret double %res
@@ -420,6 +1114,30 @@ define half @fminv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -432,6 +1150,49 @@ define half @fminv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -445,6 +1206,90 @@ define half @fminv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
   ret half %res
@@ -458,6 +1303,16 @@ define float @fminv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -470,6 +1325,18 @@ define float @fminv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -483,6 +1350,25 @@ define float @fminv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fminnm s0, s2, s0
+; NONEON-NOSVE-NEXT:    fminnm s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fminnm s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fminnm s2, s2, s3
+; NONEON-NOSVE-NEXT:    fminnm s1, s4, s1
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s2
+; NONEON-NOSVE-NEXT:    fminnm s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
   ret float %res
@@ -492,6 +1378,10 @@ define double @fminv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fminv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -504,6 +1394,14 @@ define double @fminv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fminnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -517,6 +1415,18 @@ define double @fminv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fminnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fminnm d0, d2, d0
+; NONEON-NOSVE-NEXT:    fminnm d1, d3, d1
+; NONEON-NOSVE-NEXT:    fminnm d0, d1, d0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
   ret double %res
@@ -534,6 +1444,30 @@ define half @fmaximumv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -546,6 +1480,49 @@ define half @fmaximumv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -559,6 +1536,90 @@ define half @fmaximumv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op)
   ret half %res
@@ -572,6 +1633,16 @@ define float @fmaximumv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -584,6 +1655,18 @@ define float @fmaximumv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -597,6 +1680,25 @@ define float @fmaximumv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fmax s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmax s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmax s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmax s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmax s1, s4, s1
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmax s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op)
   ret float %res
@@ -606,6 +1708,10 @@ define double @fmaximumv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fmaximumv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmaximum.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -618,6 +1724,14 @@ define double @fmaximumv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fmaxv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -631,6 +1745,18 @@ define double @fmaximumv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fmaxv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fmaximumv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fmax d0, d2, d0
+; NONEON-NOSVE-NEXT:    fmax d1, d3, d1
+; NONEON-NOSVE-NEXT:    fmax d0, d1, d0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op)
   ret double %res
@@ -648,6 +1774,30 @@ define half @fminimumv_v4f16(<4 x half> %a) {
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a)
   ret half %res
 }
@@ -660,6 +1810,49 @@ define half @fminimumv_v8f16(<8 x half> %a) {
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a)
   ret half %res
 }
@@ -673,6 +1866,90 @@ define half @fminimumv_v16f16(ptr %a) {
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s2, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h1, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s2
+; NONEON-NOSVE-NEXT:    ldr h2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s2, h2
+; NONEON-NOSVE-NEXT:    fcvt s3, h3
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s2
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt h1, s1
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op)
   ret half %res
@@ -686,6 +1963,16 @@ define float @fminimumv_v2f32(<2 x float> %a) {
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a)
   ret float %res
 }
@@ -698,6 +1985,18 @@ define float @fminimumv_v4f32(<4 x float> %a) {
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s2, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
   ret float %res
 }
@@ -711,6 +2010,25 @@ define float @fminimumv_v8f32(ptr %a) {
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s3, s2, [sp]
+; NONEON-NOSVE-NEXT:    fmin s0, s2, s0
+; NONEON-NOSVE-NEXT:    fmin s1, s3, s1
+; NONEON-NOSVE-NEXT:    ldp s2, s4, [sp, #8]
+; NONEON-NOSVE-NEXT:    fmin s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s3, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fmin s2, s2, s3
+; NONEON-NOSVE-NEXT:    fmin s1, s4, s1
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s2
+; NONEON-NOSVE-NEXT:    fmin s0, s0, s1
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op)
   ret float %res
@@ -720,6 +2038,10 @@ define double @fminimumv_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: fminimumv_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fminimum.v1f64(<1 x double> %a)
   ret double %res
 }
@@ -732,6 +2054,14 @@ define double @fminimumv_v2f64(<2 x double> %a) {
 ; CHECK-NEXT:    fminv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp], #16
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    ret
   %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a)
   ret double %res
 }
@@ -745,6 +2075,18 @@ define double @fminimumv_v4f64(ptr %a) {
 ; CHECK-NEXT:    fminv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fminimumv_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp], #32
+; NONEON-NOSVE-NEXT:    fmin d0, d2, d0
+; NONEON-NOSVE-NEXT:    fmin d1, d3, d1
+; NONEON-NOSVE-NEXT:    fmin d0, d1, d0
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op)
   ret double %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
index 412c27cb82f1d..344aac5b19838 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,6 +17,34 @@ define <2 x half> @frintp_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.ceil.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -28,6 +57,34 @@ define <4 x half> @frintp_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -40,6 +97,54 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -53,6 +158,96 @@ define void @frintp_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintp z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -67,6 +262,19 @@ define <2 x float> @frintp_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -79,6 +287,24 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -92,6 +318,36 @@ define void @frintp_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintp z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintp s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintp s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -103,6 +359,16 @@ define <1 x double> @frintp_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintp d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -115,6 +381,19 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintp z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -128,6 +407,26 @@ define void @frintp_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintp z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintp_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintp d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintp d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintp d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -146,6 +445,34 @@ define <2 x half> @frintm_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.floor.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -158,6 +485,34 @@ define <4 x half> @frintm_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -170,6 +525,54 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -183,6 +586,96 @@ define void @frintm_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintm z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -197,6 +690,19 @@ define <2 x float> @frintm_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -209,6 +715,24 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -222,6 +746,36 @@ define void @frintm_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintm z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintm s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintm s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -233,6 +787,16 @@ define <1 x double> @frintm_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintm d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -245,6 +809,19 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintm z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -258,6 +835,26 @@ define void @frintm_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintm z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintm_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintm d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintm d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintm d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -276,6 +873,34 @@ define <2 x half> @frinti_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.nearbyint.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -288,6 +913,34 @@ define <4 x half> @frinti_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -300,6 +953,54 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -313,6 +1014,96 @@ define void @frinti_v16f16(ptr %a) {
 ; CHECK-NEXT:    frinti z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -327,6 +1118,19 @@ define <2 x float> @frinti_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -339,6 +1143,24 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -352,6 +1174,36 @@ define void @frinti_v8f32(ptr %a) {
 ; CHECK-NEXT:    frinti z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinti s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinti s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -363,6 +1215,16 @@ define <1 x double> @frinti_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frinti d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -375,6 +1237,19 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frinti z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -388,6 +1263,26 @@ define void @frinti_v4f64(ptr %a) {
 ; CHECK-NEXT:    frinti z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinti_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinti d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinti d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinti d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -406,6 +1301,34 @@ define <2 x half> @frintx_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.rint.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -418,6 +1341,34 @@ define <4 x half> @frintx_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -430,6 +1381,54 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -443,6 +1442,96 @@ define void @frintx_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintx z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -457,6 +1546,19 @@ define <2 x float> @frintx_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -469,6 +1571,24 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -482,6 +1602,36 @@ define void @frintx_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintx s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintx s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -493,6 +1643,16 @@ define <1 x double> @frintx_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintx d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -505,6 +1665,19 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -518,6 +1691,26 @@ define void @frintx_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintx_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintx d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintx d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintx d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -536,6 +1729,34 @@ define <2 x half> @frinta_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.round.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -548,6 +1769,34 @@ define <4 x half> @frinta_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.round.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -560,6 +1809,54 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.round.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -573,6 +1870,96 @@ define void @frinta_v16f16(ptr %a) {
 ; CHECK-NEXT:    frinta z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -587,6 +1974,19 @@ define <2 x float> @frinta_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.round.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -599,6 +1999,24 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.round.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -612,6 +2030,36 @@ define void @frinta_v8f32(ptr %a) {
 ; CHECK-NEXT:    frinta z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frinta s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frinta s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -623,6 +2071,16 @@ define <1 x double> @frinta_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frinta d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.round.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -635,6 +2093,19 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frinta z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.round.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -648,6 +2119,26 @@ define void @frinta_v4f64(ptr %a) {
 ; CHECK-NEXT:    frinta z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frinta_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frinta d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frinta d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frinta d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -666,6 +2157,34 @@ define <2 x half> @frintn_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -678,6 +2197,34 @@ define <4 x half> @frintn_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -690,6 +2237,54 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -703,6 +2298,96 @@ define void @frintn_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintn z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -717,6 +2402,19 @@ define <2 x float> @frintn_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -729,6 +2427,24 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -742,6 +2458,36 @@ define void @frintn_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintn z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintn s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintn s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -753,6 +2499,16 @@ define <1 x double> @frintn_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintn d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -765,6 +2521,19 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintn z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -778,6 +2547,26 @@ define void @frintn_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintn z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintn_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintn d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintn d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintn d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
@@ -796,6 +2585,34 @@ define <2 x half> @frintz_v2f16(<2 x half> %op) {
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x half> @llvm.trunc.v2f16(<2 x half> %op)
   ret <2 x half> %res
 }
@@ -808,6 +2625,34 @@ define <4 x half> @frintz_v4f16(<4 x half> %op) {
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x half> @llvm.trunc.v4f16(<4 x half> %op)
   ret <4 x half> %res
 }
@@ -820,6 +2665,54 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) {
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op)
   ret <8 x half> %res
 }
@@ -833,6 +2726,96 @@ define void @frintz_v16f16(ptr %a) {
 ; CHECK-NEXT:    frintz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op)
   store <16 x half> %res, ptr %a
@@ -847,6 +2830,19 @@ define <2 x float> @frintz_v2f32(<2 x float> %op) {
 ; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op)
   ret <2 x float> %res
 }
@@ -859,6 +2855,24 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) {
 ; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op)
   ret <4 x float> %res
 }
@@ -872,6 +2886,36 @@ define void @frintz_v8f32(ptr %a) {
 ; CHECK-NEXT:    frintz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    frintz s1, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    frintz s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op)
   store <8 x float> %res, ptr %a
@@ -883,6 +2927,16 @@ define <1 x double> @frintz_v1f64(<1 x double> %op) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintz d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op)
   ret <1 x double> %res
 }
@@ -895,6 +2949,19 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) {
 ; CHECK-NEXT:    frintz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op)
   ret <2 x double> %res
 }
@@ -908,6 +2975,26 @@ define void @frintz_v4f64(ptr %a) {
 ; CHECK-NEXT:    frintz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: frintz_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    frintz d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    frintz d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    frintz d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op)
   store <4 x double> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
index 89697cde848b5..daa9b51cc827b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,6 +17,32 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x half> %op1, <2 x half> %op2
   ret <2 x half> %sel
 }
@@ -32,6 +59,32 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2
   ret <4 x half> %sel
 }
@@ -48,6 +101,47 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2
   ret <8 x half> %sel
 }
@@ -67,6 +161,87 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #62]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #60]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #58]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #56]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #52]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #50]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <16 x half>, ptr %a
   %op2 = load volatile <16 x half>, ptr %b
   %sel = select i1 %mask, <16 x half> %op1, <16 x half> %op2
@@ -86,6 +261,22 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2
   ret <2 x float> %sel
 }
@@ -102,6 +293,27 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2
   ret <4 x float> %sel
 }
@@ -121,6 +333,47 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcsel s3, s0, s2, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp]
+; NONEON-NOSVE-NEXT:    fcsel s0, s0, s1, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <8 x float>, ptr %a
   %op2 = load volatile <8 x float>, ptr %b
   %sel = select i1 %mask, <8 x float> %op1, <8 x float> %op2
@@ -134,6 +387,17 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    fcsel d0, d0, d1, ne
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2
   ret <1 x double> %sel
 }
@@ -151,6 +415,21 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel d3, d2, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2
   ret <2 x double> %sel
 }
@@ -171,6 +450,35 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel d3, d0, d2, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel d3, d0, d2, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <4 x double>, ptr %a
   %op2 = load volatile <4 x double>, ptr %b
   %sel = select i1 %mask, <4 x double> %op1, <4 x double> %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 5840ffb20994c..0d92a6fa0fa28 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -15,6 +16,30 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -27,6 +52,48 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i16>
   store <8 x i16> %res, ptr %b
@@ -42,6 +109,80 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -61,6 +202,21 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -74,6 +230,29 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -90,6 +269,46 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -114,6 +333,78 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i32>
   store <16 x i32> %res, ptr %b
@@ -130,6 +421,17 @@ define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) {
 ; CHECK-NEXT:    fcvtzu x8, h0
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -145,6 +447,22 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -167,6 +485,31 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fptoui <4 x half> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -204,6 +547,48 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1, #32]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptoui <8 x half> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -264,6 +649,83 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q5, q2, [x1, #96]
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i64>
   store <16 x i64> %res, ptr %b
@@ -282,6 +744,18 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -295,6 +769,24 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -312,6 +804,35 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -336,6 +857,60 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v16f32_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptoui <16 x float> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -354,6 +929,18 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -366,6 +953,22 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -379,6 +982,32 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -398,6 +1027,17 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v1f32_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -411,6 +1051,19 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -427,6 +1080,26 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptoui <4 x float> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -451,6 +1124,38 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -468,6 +1173,16 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
 }
@@ -481,6 +1196,18 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -509,6 +1236,31 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -552,6 +1304,53 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
 ; CHECK-NEXT:    strh w8, [sp, #2]
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #142]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #138]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -628,6 +1427,94 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q1, q7, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q4, [sp]
+; NONEON-NOSVE-NEXT:    stp q5, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #286]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #282]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %res = fptoui <16 x double> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -647,6 +1534,16 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
 }
@@ -660,6 +1557,18 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -677,6 +1586,23 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -701,6 +1627,36 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzu w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -719,6 +1675,16 @@ define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -731,6 +1697,18 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) {
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptoui <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -744,6 +1722,24 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzu x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzu x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptoui <4 x double> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -762,6 +1758,30 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -774,6 +1794,48 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i16>
   store <8 x i16> %res, ptr %b
@@ -789,6 +1851,80 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -808,6 +1944,21 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -821,6 +1972,29 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x half> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -837,6 +2011,46 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -861,6 +2075,78 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i32>
   store <16 x i32> %res, ptr %b
@@ -877,6 +2163,17 @@ define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) {
 ; CHECK-NEXT:    fcvtzs x8, h0
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x half> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -893,6 +2190,22 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x half> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -915,6 +2228,31 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %res = fptosi <4 x half> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -952,6 +2290,48 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1, #32]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fptosi <8 x half> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -1012,6 +2392,83 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q5, q2, [x1, #96]
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i64>
   store <16 x i64> %res, ptr %b
@@ -1030,6 +2487,18 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -1043,6 +2512,24 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i16>
   ret <4 x i16> %res
 }
@@ -1060,6 +2547,35 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -1084,6 +2600,60 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v16f32_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptosi <16 x float> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -1102,6 +2672,18 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -1114,6 +2696,22 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <4 x float> %op1 to <4 x i32>
   ret <4 x i32> %res
 }
@@ -1127,6 +2725,32 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, s0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -1146,6 +2770,17 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v1f32_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    fmov d0, x8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x float> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -1159,6 +2794,19 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x float> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -1175,6 +2823,26 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %res = fptosi <4 x float> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
@@ -1199,6 +2867,38 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, s1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, s0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i64>
   store <8 x i64> %res, ptr %b
@@ -1218,6 +2918,16 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i16>
   ret <1 x i16> %res
 }
@@ -1231,6 +2941,18 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i16>
   ret <2 x i16> %res
 }
@@ -1259,6 +2981,31 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i16>
   ret <4 x i16> %res
@@ -1302,6 +3049,53 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
 ; CHECK-NEXT:    strh w8, [sp, #2]
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #142]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #138]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i16>
   ret <8 x i16> %res
@@ -1378,6 +3172,94 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 304
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q1, q7, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q4, [sp]
+; NONEON-NOSVE-NEXT:    stp q5, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #286]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #282]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %res = fptosi <16 x double> %op1 to <16 x i16>
   store <16 x i16> %res, ptr %b
@@ -1397,6 +3279,16 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i32>
   ret <1 x i32> %res
 }
@@ -1410,6 +3302,18 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i32>
   ret <2 x i32> %res
 }
@@ -1427,6 +3331,23 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i32>
   ret <4 x i32> %res
@@ -1451,6 +3372,36 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvtzs w9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs w8, d0
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i32>
   store <8 x i32> %res, ptr %b
@@ -1469,6 +3420,16 @@ define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <1 x double> %op1 to <1 x i64>
   ret <1 x i64> %res
 }
@@ -1481,6 +3442,18 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) {
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = fptosi <2 x double> %op1 to <2 x i64>
   ret <2 x i64> %res
 }
@@ -1494,6 +3467,24 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvtzs x9, d1
+; NONEON-NOSVE-NEXT:    fcvtzs x8, d0
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptosi <4 x double> %op1 to <4 x i64>
   store <4 x i64> %res, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index c1c7b5c05f5d5..69661049bcb6f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -27,6 +28,31 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    str w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x half> %op1, <2 x half> %op2
   ret <2 x half> %sel
 }
@@ -45,6 +71,44 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w9, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2
   ret <4 x half> %sel
 }
@@ -64,6 +128,72 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #47]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #45]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w13, w15, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w13, w14, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w12, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2
   ret <8 x half> %sel
 }
@@ -80,6 +210,130 @@ define void @select_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h4, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s2, h0
+; NONEON-NOSVE-NEXT:    fcvt s3, h1
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s6, h4
+; NONEON-NOSVE-NEXT:    fcvt s7, h5
+; NONEON-NOSVE-NEXT:    ldr h19, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s18, h17
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s20, h19
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h25, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    fcvt s2, h16
+; NONEON-NOSVE-NEXT:    ldr h3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h26, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h27, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h28, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h29, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s6
+; NONEON-NOSVE-NEXT:    fcvt s7, h3
+; NONEON-NOSVE-NEXT:    ldr h6, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcsel s1, s5, s4, eq
+; NONEON-NOSVE-NEXT:    fcmp s18, s2
+; NONEON-NOSVE-NEXT:    fcvt s4, h6
+; NONEON-NOSVE-NEXT:    fcvt s18, h21
+; NONEON-NOSVE-NEXT:    ldr h5, [sp, #28]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    fcsel s2, s17, s16, eq
+; NONEON-NOSVE-NEXT:    fcmp s20, s7
+; NONEON-NOSVE-NEXT:    fcvt s16, h5
+; NONEON-NOSVE-NEXT:    fcvt s17, h22
+; NONEON-NOSVE-NEXT:    ldr h7, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #68]
+; NONEON-NOSVE-NEXT:    fcsel s3, s19, s3, eq
+; NONEON-NOSVE-NEXT:    fcmp s18, s4
+; NONEON-NOSVE-NEXT:    fcvt s19, h7
+; NONEON-NOSVE-NEXT:    fcvt s23, h20
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #70]
+; NONEON-NOSVE-NEXT:    fcsel s4, s21, s6, eq
+; NONEON-NOSVE-NEXT:    fcmp s17, s16
+; NONEON-NOSVE-NEXT:    fcvt s17, h18
+; NONEON-NOSVE-NEXT:    fcvt s21, h24
+; NONEON-NOSVE-NEXT:    ldr h16, [sp, #50]
+; NONEON-NOSVE-NEXT:    str h3, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcsel s5, s22, s5, eq
+; NONEON-NOSVE-NEXT:    fcmp s23, s19
+; NONEON-NOSVE-NEXT:    fcvt s22, h16
+; NONEON-NOSVE-NEXT:    fcvt s23, h25
+; NONEON-NOSVE-NEXT:    ldr h19, [sp, #52]
+; NONEON-NOSVE-NEXT:    str h4, [sp, #74]
+; NONEON-NOSVE-NEXT:    fcsel s6, s20, s7, eq
+; NONEON-NOSVE-NEXT:    fcmp s21, s17
+; NONEON-NOSVE-NEXT:    fcvt s20, h19
+; NONEON-NOSVE-NEXT:    fcvt s21, h26
+; NONEON-NOSVE-NEXT:    ldr h17, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h5, [sp, #76]
+; NONEON-NOSVE-NEXT:    fcsel s7, s24, s18, eq
+; NONEON-NOSVE-NEXT:    fcmp s23, s22
+; NONEON-NOSVE-NEXT:    fcvt s22, h17
+; NONEON-NOSVE-NEXT:    fcvt s23, h27
+; NONEON-NOSVE-NEXT:    ldr h18, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #40]
+; NONEON-NOSVE-NEXT:    str h6, [sp, #78]
+; NONEON-NOSVE-NEXT:    fcsel s16, s25, s16, eq
+; NONEON-NOSVE-NEXT:    fcmp s21, s20
+; NONEON-NOSVE-NEXT:    fcvt s21, h18
+; NONEON-NOSVE-NEXT:    fcvt s25, h24
+; NONEON-NOSVE-NEXT:    ldr h20, [sp, #58]
+; NONEON-NOSVE-NEXT:    str h7, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcsel s19, s26, s19, eq
+; NONEON-NOSVE-NEXT:    fcmp s23, s22
+; NONEON-NOSVE-NEXT:    fcvt s23, h20
+; NONEON-NOSVE-NEXT:    fcvt s26, h28
+; NONEON-NOSVE-NEXT:    ldr h22, [sp, #60]
+; NONEON-NOSVE-NEXT:    str h16, [sp, #82]
+; NONEON-NOSVE-NEXT:    fcsel s17, s27, s17, eq
+; NONEON-NOSVE-NEXT:    fcmp s25, s21
+; NONEON-NOSVE-NEXT:    fcvt s25, h22
+; NONEON-NOSVE-NEXT:    fcvt s27, h29
+; NONEON-NOSVE-NEXT:    ldr h21, [sp, #62]
+; NONEON-NOSVE-NEXT:    str h19, [sp, #84]
+; NONEON-NOSVE-NEXT:    fcsel s18, s24, s18, eq
+; NONEON-NOSVE-NEXT:    ldr h24, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcmp s26, s23
+; NONEON-NOSVE-NEXT:    fcvt s23, h21
+; NONEON-NOSVE-NEXT:    str h17, [sp, #86]
+; NONEON-NOSVE-NEXT:    fcvt s26, h24
+; NONEON-NOSVE-NEXT:    fcsel s20, s28, s20, eq
+; NONEON-NOSVE-NEXT:    fcmp s27, s25
+; NONEON-NOSVE-NEXT:    ldr h25, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h27, [sp]
+; NONEON-NOSVE-NEXT:    str h18, [sp, #88]
+; NONEON-NOSVE-NEXT:    fcvt s17, h25
+; NONEON-NOSVE-NEXT:    fcvt s18, h27
+; NONEON-NOSVE-NEXT:    fcsel s7, s29, s22, eq
+; NONEON-NOSVE-NEXT:    fcmp s26, s23
+; NONEON-NOSVE-NEXT:    str h20, [sp, #90]
+; NONEON-NOSVE-NEXT:    fcsel s16, s24, s21, eq
+; NONEON-NOSVE-NEXT:    str h7, [sp, #92]
+; NONEON-NOSVE-NEXT:    fcmp s18, s17
+; NONEON-NOSVE-NEXT:    str h16, [sp, #94]
+; NONEON-NOSVE-NEXT:    fcsel s2, s27, s25, eq
+; NONEON-NOSVE-NEXT:    str h2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %mask = fcmp oeq <16 x half> %op1, %op2
@@ -102,6 +356,26 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    sbfx w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2
   ret <2 x float> %sel
 }
@@ -121,6 +395,40 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    sbfx w9, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcsel s3, s2, s0, ne
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, ne
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2
   ret <4 x float> %sel
 }
@@ -137,6 +445,49 @@ define void @select_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr s4, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr s17, [sp]
+; NONEON-NOSVE-NEXT:    ldp s6, s7, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcmp s1, s0
+; NONEON-NOSVE-NEXT:    fcsel s0, s1, s0, eq
+; NONEON-NOSVE-NEXT:    fcmp s3, s2
+; NONEON-NOSVE-NEXT:    ldp s1, s5, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcsel s2, s3, s2, eq
+; NONEON-NOSVE-NEXT:    ldp s16, s3, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcmp s4, s1
+; NONEON-NOSVE-NEXT:    fcsel s1, s4, s1, eq
+; NONEON-NOSVE-NEXT:    fcmp s5, s3
+; NONEON-NOSVE-NEXT:    ldr s4, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcsel s3, s5, s3, eq
+; NONEON-NOSVE-NEXT:    fcmp s6, s4
+; NONEON-NOSVE-NEXT:    ldr s5, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp s2, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    fcsel s4, s6, s4, eq
+; NONEON-NOSVE-NEXT:    fcmp s7, s5
+; NONEON-NOSVE-NEXT:    ldr s6, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcsel s5, s7, s5, eq
+; NONEON-NOSVE-NEXT:    fcmp s16, s6
+; NONEON-NOSVE-NEXT:    ldr s7, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s3, s4, [sp, #80]
+; NONEON-NOSVE-NEXT:    fcsel s6, s16, s6, eq
+; NONEON-NOSVE-NEXT:    fcmp s17, s7
+; NONEON-NOSVE-NEXT:    fcsel s3, s17, s7, eq
+; NONEON-NOSVE-NEXT:    stp s5, s6, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp s3, s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %mask = fcmp oeq <8 x float> %op1, %op2
@@ -151,6 +502,17 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1>
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    fcsel d0, d0, d1, ne
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    fcsel d0, d0, d1, ne
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2
   ret <1 x double> %sel
 }
@@ -170,6 +532,27 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1>
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    sbfx x8, x8, #0, #1
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    sbfx x8, x9, #0, #1
+; NONEON-NOSVE-NEXT:    fcsel d3, d2, d0, ne
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, ne
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2
   ret <2 x double> %sel
 }
@@ -186,6 +569,33 @@ define void @select_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.d, p0, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d5, d1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d0, d3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp d4, d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp d1, d0
+; NONEON-NOSVE-NEXT:    fcsel d0, d1, d0, eq
+; NONEON-NOSVE-NEXT:    fcmp d3, d2
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcsel d2, d3, d2, eq
+; NONEON-NOSVE-NEXT:    fcmp d4, d1
+; NONEON-NOSVE-NEXT:    ldr d3, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcsel d1, d4, d1, eq
+; NONEON-NOSVE-NEXT:    fcmp d5, d3
+; NONEON-NOSVE-NEXT:    fcsel d3, d5, d3, eq
+; NONEON-NOSVE-NEXT:    stp d2, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d3, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %mask = fcmp oeq <4 x double> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index ff38db8c10c04..3ba61c3335a64 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -21,6 +22,25 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i8> %op1, i8 5, i64 3
     ret <4 x i8> %r
 }
@@ -38,6 +58,27 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) {
 ; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x i8> %op1, i8 5, i64 7
     ret <8 x i8> %r
 }
@@ -55,6 +96,29 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) {
 ; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <16 x i8> %op1, i8 5, i64 15
     ret <16 x i8> %r
 }
@@ -72,6 +136,29 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) {
 ; CHECK-NEXT:    mov z1.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <32 x i8> %op1, i8 5, i64 31
     ret <32 x i8> %r
 }
@@ -90,6 +177,22 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i16> %op1, i16 5, i64 1
     ret <2 x i16> %r
 }
@@ -107,6 +210,25 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i16> %op1, i16 5, i64 3
     ret <4 x i16> %r
 }
@@ -124,6 +246,27 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x i16> %op1, i16 5, i64 7
     ret <8 x i16> %r
 }
@@ -141,6 +284,27 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) {
 ; CHECK-NEXT:    mov z1.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <16 x i16> %op1, i16 5, i64 15
     ret <16 x i16> %r
 }
@@ -159,6 +323,22 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i32> %op1, i32 5, i64 1
     ret <2 x i32> %r
 }
@@ -176,6 +356,24 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x i32> %op1, i32 5, i64 3
     ret <4 x i32> %r
 }
@@ -193,6 +391,24 @@ define <8 x i32> @insertelement_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z1.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %op1 = load <8 x i32>, ptr %a
     %r = insertelement <8 x i32> %op1, i32 5, i64 7
     ret <8 x i32> %r
@@ -205,6 +421,16 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) {
 ; CHECK-NEXT:    mov z0.d, #5 // =0x5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <1 x i64> %op1, i64 5, i64 0
     ret <1 x i64> %r
 }
@@ -222,6 +448,22 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) {
 ; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x i64> %op1, i64 5, i64 1
     ret <2 x i64> %r
 }
@@ -239,6 +481,22 @@ define <4 x i64> @insertelement_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z1.d, p0/m, x8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #5 // =0x5
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %op1 = load <4 x i64>, ptr %a
     %r = insertelement <4 x i64> %op1, i64 5, i64 3
     ret <4 x i64> %r
@@ -257,6 +515,19 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI14_0
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [x8, :lo12:.LCPI14_0]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x half> %op1, half 5.0, i64 1
     ret <2 x half> %r
 }
@@ -274,6 +545,26 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI15_0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [x8, :lo12:.LCPI15_0]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x half> %op1, half 5.0, i64 3
     ret <4 x half> %r
 }
@@ -291,6 +582,28 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) {
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI16_0
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [x8, :lo12:.LCPI16_0]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <8 x half> %op1, half 5.0, i64 7
     ret <8 x half> %r
 }
@@ -308,6 +621,28 @@ define <16 x half> @insertelement_v16f16(ptr %a) {
 ; CHECK-NEXT:    mov z1.h, p0/m, h2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI17_0
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldr h1, [x8, :lo12:.LCPI17_0]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %op1 = load <16 x half>, ptr %a
     %r = insertelement <16 x half> %op1, half 5.0, i64 15
     ret <16 x half> %r
@@ -327,6 +662,22 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    mov w8, #1084227584 // =0x40a00000
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x float> %op1, float 5.0, i64 1
     ret <2 x float> %r
 }
@@ -344,6 +695,24 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) {
 ; CHECK-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #1084227584 // =0x40a00000
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <4 x float> %op1, float 5.0, i64 3
     ret <4 x float> %r
 }
@@ -361,6 +730,25 @@ define <8 x float> @insertelement_v8f32(ptr %a) {
 ; CHECK-NEXT:    mov z1.s, p0/m, s2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov w8, #1084227584 // =0x40a00000
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %op1 = load <8 x float>, ptr %a
     %r = insertelement <8 x float> %op1, float 5.0, i64 7
     ret <8 x float> %r
@@ -372,6 +760,16 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, #5.00000000
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <1 x double> %op1, double 5.0, i64 0
     ret <1 x double> %r
 }
@@ -389,6 +787,22 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) {
 ; CHECK-NEXT:    mov z0.d, p0/m, d1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %r = insertelement <2 x double> %op1, double 5.0, i64 1
     ret <2 x double> %r
 }
@@ -406,6 +820,23 @@ define <4 x double> @insertelement_v4f64(ptr %a) {
 ; CHECK-NEXT:    mov z1.d, p0/m, d2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: insertelement_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
     %op1 = load <4 x double>, ptr %a
     %r = insertelement <4 x double> %op1, double 5.0, i64 3
     ret <4 x double> %r
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
index ee1706bc7c354..a2875ffef2e88 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,6 +17,31 @@ define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -28,6 +54,47 @@ define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = add <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -40,6 +107,78 @@ define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = add <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -53,6 +192,147 @@ define void @add_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = add <32 x i8> %op1, %op2
@@ -68,6 +348,22 @@ define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -80,6 +376,31 @@ define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -92,6 +413,46 @@ define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = add <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -105,6 +466,83 @@ define void @add_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = add <16 x i16> %op1, %op2
@@ -120,6 +558,22 @@ define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -132,6 +586,28 @@ define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = add <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -145,6 +621,47 @@ define void @add_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.s, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = add <8 x i32> %op1, %op2
@@ -160,6 +677,18 @@ define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = add <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -172,6 +701,21 @@ define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = add <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -185,6 +729,33 @@ define void @add_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = add <4 x i64> %op1, %op2
@@ -213,6 +784,31 @@ define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE2-NEXT:    mul z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -234,6 +830,47 @@ define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE2-NEXT:    mul z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -255,6 +892,78 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE2-NEXT:    mul z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -279,6 +988,147 @@ define void @mul_v32i8(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = mul <32 x i8> %op1, %op2
@@ -303,6 +1153,21 @@ define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE2-NEXT:    mul z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -324,6 +1189,31 @@ define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE2-NEXT:    mul z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -345,6 +1235,46 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE2-NEXT:    mul z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -369,6 +1299,83 @@ define void @mul_v16i16(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = mul <16 x i16> %op1, %op2
@@ -393,6 +1400,21 @@ define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE2-NEXT:    mul z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -414,6 +1436,26 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE2-NEXT:    mul z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -438,6 +1480,43 @@ define void @mul_v8i32(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    mul w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = mul <8 x i32> %op1, %op2
@@ -462,6 +1541,18 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE2-NEXT:    mul z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -483,6 +1574,20 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE2-NEXT:    mul z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = mul <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -507,6 +1612,31 @@ define void @mul_v4i64(ptr %a, ptr %b) {
 ; SVE2-NEXT:    mul z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    mul x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = mul <4 x i64> %op1, %op2
@@ -526,6 +1656,31 @@ define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    sub z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -538,6 +1693,47 @@ define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    sub z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -550,6 +1746,78 @@ define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    sub z0.b, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -563,6 +1831,147 @@ define void @sub_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = sub <32 x i8> %op1, %op2
@@ -578,6 +1987,22 @@ define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    sub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -590,6 +2015,31 @@ define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    sub z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -602,6 +2052,46 @@ define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    sub z0.h, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -615,6 +2105,83 @@ define void @sub_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = sub <16 x i16> %op1, %op2
@@ -630,6 +2197,22 @@ define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    sub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -642,6 +2225,28 @@ define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    sub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -655,6 +2260,47 @@ define void @sub_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.s, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = sub <8 x i32> %op1, %op2
@@ -670,6 +2316,18 @@ define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    sub z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -682,6 +2340,21 @@ define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    sub z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = sub <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -695,6 +2368,33 @@ define void @sub_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sub z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sub x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = sub <4 x i64> %op1, %op2
@@ -715,6 +2415,30 @@ define <4 x i8> @abs_v4i8(<4 x i8> %op1) {
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cneg w8, w9, mi
+; NONEON-NOSVE-NEXT:    cmp w10, #0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cneg w8, w10, mi
+; NONEON-NOSVE-NEXT:    cmp w11, #0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cneg w8, w11, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false)
   ret <4 x i8> %res
 }
@@ -727,6 +2451,46 @@ define <8 x i8> @abs_v8i8(<8 x i8> %op1) {
 ; CHECK-NEXT:    abs z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
   ret <8 x i8> %res
 }
@@ -739,6 +2503,78 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) {
 ; CHECK-NEXT:    abs z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
   ret <16 x i8> %res
 }
@@ -752,6 +2588,144 @@ define void @abs_v32i8(ptr %a) {
 ; CHECK-NEXT:    abs z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
   store <32 x i8> %res, ptr %a
@@ -767,6 +2741,21 @@ define <2 x i16> @abs_v2i16(<2 x i16> %op1) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w9, mi
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false)
   ret <2 x i16> %res
 }
@@ -779,6 +2768,30 @@ define <4 x i16> @abs_v4i16(<4 x i16> %op1) {
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
   ret <4 x i16> %res
 }
@@ -791,6 +2804,46 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) {
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
   ret <8 x i16> %res
 }
@@ -804,6 +2857,80 @@ define void @abs_v16i16(ptr %a) {
 ; CHECK-NEXT:    abs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
   store <16 x i16> %res, ptr %a
@@ -818,6 +2945,21 @@ define <2 x i32> @abs_v2i32(<2 x i32> %op1) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
   ret <2 x i32> %res
 }
@@ -830,6 +2972,28 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
   ret <4 x i32> %res
 }
@@ -843,6 +3007,44 @@ define void @abs_v8i32(ptr %a) {
 ; CHECK-NEXT:    abs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
   store <8 x i32> %res, ptr %a
@@ -857,6 +3059,18 @@ define <1 x i64> @abs_v1i64(<1 x i64> %op1) {
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
   ret <1 x i64> %res
 }
@@ -869,6 +3083,21 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) {
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
   ret <2 x i64> %res
 }
@@ -882,6 +3111,30 @@ define void @abs_v4i64(ptr %a) {
 ; CHECK-NEXT:    abs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
index c2f3bbfb51dd5..0b4316686fff6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,6 +19,55 @@ define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <8 x i8> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i8>
   ret <8 x i8> %sext
@@ -33,6 +83,94 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <16 x i8> %op1, %op2
   %sext = sext <16 x i1> %cmp to <16 x i8>
   ret <16 x i8> %sext
@@ -50,6 +188,179 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %cmp = icmp eq <32 x i8> %op1, %op2
@@ -68,6 +379,35 @@ define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <4 x i16> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i16>
   ret <4 x i16> %sext
@@ -83,6 +423,54 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <8 x i16> %op1, %op2
   %sext = sext <8 x i1> %cmp to <8 x i16>
   ret <8 x i16> %sext
@@ -100,6 +488,99 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %cmp = icmp eq <16 x i16> %op1, %op2
@@ -118,6 +599,23 @@ define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <2 x i32> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i32>
   ret <2 x i32> %sext
@@ -133,6 +631,30 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <4 x i32> %op1, %op2
   %sext = sext <4 x i1> %cmp to <4 x i32>
   ret <4 x i32> %sext
@@ -150,6 +672,51 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %cmp = icmp eq <8 x i32> %op1, %op2
@@ -168,6 +735,19 @@ define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <1 x i64> %op1, %op2
   %sext = sext <1 x i1> %cmp to <1 x i64>
   ret <1 x i64> %sext
@@ -183,6 +763,22 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, eq
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %cmp = icmp eq <2 x i64> %op1, %op2
   %sext = sext <2 x i1> %cmp to <2 x i64>
   ret <2 x i64> %sext
@@ -200,6 +796,35 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm x10, eq
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, eq
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, eq
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %cmp = icmp eq <4 x i64> %op1, %op2
@@ -224,6 +849,179 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_ne_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %cmp = icmp ne <32 x i8> %op1, %op2
@@ -246,6 +1044,57 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_sge_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, ge
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
   %cmp = icmp sge <8 x i16> %op1, %op2
@@ -270,6 +1119,99 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_sgt_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %cmp = icmp sgt <16 x i16> %op1, %op2
@@ -292,6 +1234,33 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_sle_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, le
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, le
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, le
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %op2 = load <4 x i32>, ptr %b
   %cmp = icmp sle <4 x i32> %op1, %op2
@@ -316,6 +1285,51 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_slt_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csetm w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %cmp = icmp slt <8 x i32> %op1, %op2
@@ -338,6 +1352,25 @@ define void @icmp_uge_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_uge_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, hs
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, hs
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp uge <2 x i64> %op1, %op2
@@ -360,6 +1393,25 @@ define void @icmp_ugt_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_ugt_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, hi
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp ugt <2 x i64> %op1, %op2
@@ -382,6 +1434,25 @@ define void @icmp_ule_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_ule_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, ls
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, ls
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp ule <2 x i64> %op1, %op2
@@ -404,6 +1475,25 @@ define void @icmp_ult_v2i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_ult_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x10, lo
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csetm x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
   %cmp = icmp ult <2 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index e6fd775b4cfb9..e09b1613a54af 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,6 +25,31 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -51,6 +77,47 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -98,6 +165,78 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -178,6 +317,147 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.b, p0, z3.b, z1.b
 ; CHECK-NEXT:    stp q3, q2, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = sdiv <32 x i8> %op1, %op2
@@ -196,6 +476,22 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -212,6 +508,31 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -238,6 +559,46 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -278,6 +639,83 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
 ; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = sdiv <16 x i16> %op1, %op2
@@ -294,6 +732,21 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -307,6 +760,26 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -322,6 +795,43 @@ define void @sdiv_v8i32(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    sdiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = sdiv <8 x i32> %op1, %op2
@@ -338,6 +848,18 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -351,6 +873,20 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -366,6 +902,31 @@ define void @sdiv_v4i64(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    sdiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = sdiv <4 x i64> %op1, %op2
@@ -391,6 +952,31 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w11, w12, w11
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -418,6 +1004,47 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -465,6 +1092,78 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -545,6 +1244,147 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.b, p0, z3.b, z1.b
 ; CHECK-NEXT:    stp q3, q2, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = udiv <32 x i8> %op1, %op2
@@ -563,6 +1403,22 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w9, w10, w9
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -579,6 +1435,31 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -605,6 +1486,46 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -645,6 +1566,83 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
 ; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = udiv <16 x i16> %op1, %op2
@@ -661,6 +1659,21 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -674,6 +1687,26 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -689,6 +1722,43 @@ define void @udiv_v8i32(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    udiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = udiv <8 x i32> %op1, %op2
@@ -705,6 +1775,18 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -718,6 +1800,20 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = udiv <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -733,6 +1829,31 @@ define void @udiv_v4i64(ptr %a, ptr %b)  {
 ; CHECK-NEXT:    udiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = udiv <4 x i64> %op1, %op2
@@ -778,6 +1899,70 @@ define void @udiv_constantsplat_v8i32(ptr %a)  {
 ; SVE2-NEXT:    lsr z0.s, z0.s, #6
 ; SVE2-NEXT:    stp q1, q0, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #8969 // =0x2309
+; NONEON-NOSVE-NEXT:    movk w8, #22765, lsl #16
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #6
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    umull x10, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w9, w10, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w11, w9, #6
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    umull x8, w9, w8
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    sub w9, w9, w8
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #1
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #6
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
   store <8 x i32> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index e40668a8696ee..2c2b79121ef82 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -26,6 +27,54 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) {
 ; CHECK-NEXT:    asr z0.s, z0.s, #31
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i1_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #34]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #38]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    sbfx w8, w14, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w15, #0, #1
+; NONEON-NOSVE-NEXT:    stp w8, w12, [sp, #64]
+; NONEON-NOSVE-NEXT:    sbfx w12, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    stp w12, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i1> %a to <8 x i32>
   store <8 x i32> %b, ptr %out
   ret void
@@ -52,6 +101,26 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) {
 ; CHECK-NEXT:    asr z0.d, z0.d, #61
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v4i3_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx x8, x8, #0, #3
+; NONEON-NOSVE-NEXT:    sbfx x9, x9, #0, #3
+; NONEON-NOSVE-NEXT:    sbfx x10, x10, #0, #3
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sbfx x8, x11, #0, #3
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i3> %a to <4 x i64>
   store <4 x i64> %b, ptr %out
   ret void
@@ -70,6 +139,49 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v16i8_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, ptr %out
   ret void
@@ -91,6 +203,210 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v32i8_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i16>
@@ -112,6 +428,46 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i8_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -133,6 +489,79 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v16i8_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
   ret void
@@ -167,6 +596,284 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v32i8_v32i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #464
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #448] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 464
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #448] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #304]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #464
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i32>
@@ -194,6 +901,24 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v4i8_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ushll v0.4s, v0.4h, #0
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb x8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb x11, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp x10, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -216,6 +941,61 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i8_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #176
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 176
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x8, sp, #144
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x9, x10, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x8]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #176
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -253,6 +1033,113 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q1, q4, [x0, #32]
 ; CHECK-NEXT:    stp q0, q2, [x0, #96]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v16i8_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrsw x9, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #360]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
   ret void
@@ -321,6 +1208,371 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q0, q2, [x1, #224]
 ; CHECK-NEXT:    stp q3, q1, [x1, #96]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v32i8_v32i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #752
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 848
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #272]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrsw x9, [sp, #372]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #256]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #352]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #480]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #404]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #568]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #400]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #412]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #448]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #584]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #408]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #560]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #600]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #416]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #428]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #424]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #592]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #380]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #520]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #376]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #388]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [sp, #496]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #384]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #528]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #396]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #392]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #528]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #696]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #464]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #476]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #712]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #472]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #484]
+; NONEON-NOSVE-NEXT:    ldp q16, q17, [sp, #688]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #728]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #480]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #492]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #744]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #488]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #436]
+; NONEON-NOSVE-NEXT:    ldp q19, q20, [sp, #720]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #632]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #432]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #444]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #648]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #440]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #452]
+; NONEON-NOSVE-NEXT:    ldp q22, q23, [sp, #624]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #664]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #448]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #656]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #460]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #456]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldp q21, q18, [sp, #656]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #96]
+; NONEON-NOSVE-NEXT:    stp q16, q17, [x1, #128]
+; NONEON-NOSVE-NEXT:    stp q19, q20, [x1, #160]
+; NONEON-NOSVE-NEXT:    stp q22, q23, [x1, #192]
+; NONEON-NOSVE-NEXT:    stp q21, q18, [x1, #224]
+; NONEON-NOSVE-NEXT:    add sp, sp, #752
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = sext <32 x i8> %b to <32 x i64>
@@ -341,6 +1593,29 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i16_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -361,6 +1636,95 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v16i16_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = sext <16 x i16> %b to <16 x i32>
@@ -382,6 +1746,28 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v4i16_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -403,6 +1789,43 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i16_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -437,6 +1860,128 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v16i16_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #200]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrsw x9, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrsw x8, [sp, #360]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = sext <16 x i16> %b to <16 x i64>
@@ -457,6 +2002,21 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v4i32_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %b = sext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -477,6 +2037,47 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sext_v8i32_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
   %c = sext <8 x i32> %b to <8 x i64>
@@ -497,6 +2098,49 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v16i8_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i16>
   store <16 x i16>%b, ptr %out
   ret void
@@ -518,6 +2162,210 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v32i8_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i16>
@@ -539,6 +2387,46 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v8i8_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -560,6 +2448,79 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v16i8_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
   ret void
@@ -594,6 +2555,284 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v32i8_v32i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #464
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #448] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 464
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #448] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #304]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #464
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i32>
@@ -619,6 +2858,30 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v4i8_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i8> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -641,6 +2904,65 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v8i8_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #176
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 176
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x8, sp, #144
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w10, wzr, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x8]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #176
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i8> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -678,6 +3000,133 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    stp q1, q4, [x0, #32]
 ; CHECK-NEXT:    stp q0, q2, [x0, #96]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v16i8_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #332]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #324]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #348]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #340]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #300]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #292]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #316]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #308]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #268]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #260]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #284]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #276]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #288]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #252]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp wzr, w9, [sp, #244]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x0, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
   ret void
@@ -746,6 +3195,404 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q0, q2, [x1, #224]
 ; CHECK-NEXT:    stp q3, q1, [x1, #96]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v32i8_v32i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #752
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 848
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #572]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #564]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #588]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    add w18, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #31]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w17, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #63]
+; NONEON-NOSVE-NEXT:    add w17, w30, w30
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #61]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #580]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #604]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #596]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #620]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #612]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #508]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #500]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #524]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #516]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #540]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #105]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #532]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #111]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #556]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #548]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #109]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #700]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #692]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #716]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #708]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #97]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #732]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #103]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #724]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #748]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #101]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #740]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #636]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #628]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #652]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #196]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #644]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #272]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #668]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #660]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #204]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #684]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #288]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #676]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #256]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #230]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #228]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #236]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #352]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #480]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #212]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #220]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #404]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #568]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #400]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #412]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #448]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #584]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #408]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #560]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #600]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #428]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #424]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #372]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #592]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #504]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #380]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #376]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #388]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [sp, #496]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #384]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #528]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #396]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #392]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #528]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #696]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #712]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #484]
+; NONEON-NOSVE-NEXT:    ldp q16, q17, [sp, #688]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #728]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #492]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #744]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #488]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    ldp q19, q20, [sp, #720]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #632]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #648]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    ldp q22, q23, [sp, #624]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #664]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #656]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldp q21, q18, [sp, #656]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q4, q5, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #96]
+; NONEON-NOSVE-NEXT:    stp q16, q17, [x1, #128]
+; NONEON-NOSVE-NEXT:    stp q19, q20, [x1, #160]
+; NONEON-NOSVE-NEXT:    stp q22, q23, [x1, #192]
+; NONEON-NOSVE-NEXT:    stp q21, q18, [x1, #224]
+; NONEON-NOSVE-NEXT:    add sp, sp, #752
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
   %c = zext <32 x i8> %b to <32 x i64>
@@ -766,6 +3613,29 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v8i16_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i32>
   store <8 x i32>%b, ptr %out
   ret void
@@ -786,6 +3656,95 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v16i16_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = zext <16 x i16> %b to <16 x i32>
@@ -807,6 +3766,30 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v4i16_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i16> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -828,6 +3811,47 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v8i16_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-160]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
   ret void
@@ -862,6 +3886,148 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v16i16_v16i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #368
+; NONEON-NOSVE-NEXT:    str x29, [sp, #352] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 368
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #268]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #260]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #352] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #284]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w13, w13, w13
+; NONEON-NOSVE-NEXT:    add w14, w14, w14
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w14, w3, w3
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w13, w5, w5
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w14, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w13, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w14, w0, w0
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w13, w18, w18
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w14, w17, w17
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w12, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w11, w11, w11
+; NONEON-NOSVE-NEXT:    add w10, w10, w10
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w13, w16, w16
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w14, w15, w15
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #276]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #324]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #348]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #340]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    stp wzr, w8, [sp, #252]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #200]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #292]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #316]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #308]
+; NONEON-NOSVE-NEXT:    stp wzr, w9, [sp, #244]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldp q3, q4, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #364]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #360]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #368
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
   %c = zext <16 x i16> %b to <16 x i64>
@@ -882,6 +4048,23 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v4i32_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %b = zext <4 x i32> %a to <4 x i64>
   store <4 x i64>%b, ptr %out
   ret void
@@ -902,6 +4085,51 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zext_v8i32_v8i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w9, wzr, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, wzr, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
   %c = zext <8 x i32> %b to <8 x i64>
@@ -928,6 +4156,21 @@ define void @extend_and_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ; SVE2-NEXT:    mul z0.d, z1.d, z0.d
 ; SVE2-NEXT:    str q0, [x1]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extend_and_mul:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    mov w9, w0
+; NONEON-NOSVE-NEXT:    mul x10, x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    mul x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0
   %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer
   %4 = zext <2 x i32> %broadcast.splat3 to <2 x i64>
@@ -943,6 +4186,16 @@ define void @extend_no_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ; CHECK-NEXT:    mov z0.d, x8
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: extend_no_mul:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    mov w8, w0
+; NONEON-NOSVE-NEXT:    stp x8, x8, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0
   %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
index 54276bb4ba01d..1f5bb5f5486af 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -22,6 +23,112 @@ define void @add_v32i8(ptr %a) {
 ; CHECK-NEXT:    add z1.b, z1.b, #7 // =0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -38,6 +145,64 @@ define void @add_v16i16(ptr %a) {
 ; CHECK-NEXT:    add z1.h, z1.h, #15 // =0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -54,6 +219,36 @@ define void @add_v8i32(ptr %a) {
 ; CHECK-NEXT:    add z1.s, z1.s, #31 // =0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -70,6 +265,26 @@ define void @add_v4i64(ptr %a) {
 ; CHECK-NEXT:    add z1.d, z1.d, #63 // =0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    add x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -90,6 +305,112 @@ define void @and_v32i8(ptr %a) {
 ; CHECK-NEXT:    and z1.b, z1.b, #0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -106,6 +427,64 @@ define void @and_v16i16(ptr %a) {
 ; CHECK-NEXT:    and z1.h, z1.h, #0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -122,6 +501,36 @@ define void @and_v8i32(ptr %a) {
 ; CHECK-NEXT:    and z1.s, z1.s, #0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -138,6 +547,26 @@ define void @and_v4i64(ptr %a) {
 ; CHECK-NEXT:    and z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    and x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -158,6 +587,112 @@ define void @ashr_v32i8(ptr %a) {
 ; CHECK-NEXT:    asr z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i32 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -174,6 +709,64 @@ define void @ashr_v16i16(ptr %a) {
 ; CHECK-NEXT:    asr z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -190,6 +783,36 @@ define void @ashr_v8i32(ptr %a) {
 ; CHECK-NEXT:    asr z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -206,6 +829,26 @@ define void @ashr_v4i64(ptr %a) {
 ; CHECK-NEXT:    asr z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    asr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -229,6 +872,144 @@ define void @icmp_eq_v32i8(ptr %a) {
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_eq_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #7
+; NONEON-NOSVE-NEXT:    csetm w8, eq
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -249,6 +1030,80 @@ define void @icmp_sge_v16i16(ptr %a) {
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_sge_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #14
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -269,6 +1124,44 @@ define void @icmp_sgt_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_sgt_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    csetm w9, gt
+; NONEON-NOSVE-NEXT:    cmn w8, #8
+; NONEON-NOSVE-NEXT:    csetm w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 -8, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -289,6 +1182,30 @@ define void @icmp_ult_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: icmp_ult_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csetm x9, lo
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    csetm x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    csetm x9, lo
+; NONEON-NOSVE-NEXT:    cmp x8, #63
+; NONEON-NOSVE-NEXT:    csetm x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -310,6 +1227,112 @@ define void @lshr_v32i8(ptr %a) {
 ; CHECK-NEXT:    lsr z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #7, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -326,6 +1349,64 @@ define void @lshr_v16i16(ptr %a) {
 ; CHECK-NEXT:    lsr z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #15, #1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -342,6 +1423,36 @@ define void @lshr_v8i32(ptr %a) {
 ; CHECK-NEXT:    lsr z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -358,6 +1469,26 @@ define void @lshr_v4i64(ptr %a) {
 ; CHECK-NEXT:    lsr z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -378,6 +1509,144 @@ define void @mul_v32i8(ptr %a) {
 ; CHECK-NEXT:    mul z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #3
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -394,6 +1663,80 @@ define void @mul_v16i16(ptr %a) {
 ; CHECK-NEXT:    mul z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #4
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -410,6 +1753,48 @@ define void @mul_v8i32(ptr %a) {
 ; CHECK-NEXT:    mul z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #5
+; NONEON-NOSVE-NEXT:    sub w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -426,6 +1811,32 @@ define void @mul_v4i64(ptr %a) {
 ; CHECK-NEXT:    mul z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mul_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #6
+; NONEON-NOSVE-NEXT:    sub x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -446,6 +1857,112 @@ define void @or_v32i8(ptr %a) {
 ; CHECK-NEXT:    orr z1.b, z1.b, #0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -462,6 +1979,64 @@ define void @or_v16i16(ptr %a) {
 ; CHECK-NEXT:    orr z1.h, z1.h, #0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -478,6 +2053,36 @@ define void @or_v8i32(ptr %a) {
 ; CHECK-NEXT:    orr z1.s, z1.s, #0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -494,6 +2099,26 @@ define void @or_v4i64(ptr %a) {
 ; CHECK-NEXT:    orr z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    orr x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -514,6 +2139,112 @@ define void @shl_v32i8(ptr %a) {
 ; CHECK-NEXT:    lsl z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -530,6 +2261,64 @@ define void @shl_v16i16(ptr %a) {
 ; CHECK-NEXT:    lsl z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -546,6 +2335,36 @@ define void @shl_v8i32(ptr %a) {
 ; CHECK-NEXT:    lsl z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    lsl w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    lsl w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -562,6 +2381,26 @@ define void @shl_v4i64(ptr %a) {
 ; CHECK-NEXT:    lsl z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    lsl x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -582,6 +2421,145 @@ define void @smax_v32i8(ptr %a) {
 ; CHECK-NEXT:    smax z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -598,6 +2576,81 @@ define void @smax_v16i16(ptr %a) {
 ; CHECK-NEXT:    smax z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -614,6 +2667,45 @@ define void @smax_v8i32(ptr %a) {
 ; CHECK-NEXT:    smax z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -630,6 +2722,31 @@ define void @smax_v4i64(ptr %a) {
 ; CHECK-NEXT:    smax z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -650,6 +2767,145 @@ define void @smin_v32i8(ptr %a) {
 ; CHECK-NEXT:    smin z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -666,6 +2922,81 @@ define void @smin_v16i16(ptr %a) {
 ; CHECK-NEXT:    smin z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -682,6 +3013,45 @@ define void @smin_v8i32(ptr %a) {
 ; CHECK-NEXT:    smin z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -698,6 +3068,31 @@ define void @smin_v4i64(ptr %a) {
 ; CHECK-NEXT:    smin z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -718,6 +3113,112 @@ define void @sub_v32i8(ptr %a) {
 ; CHECK-NEXT:    sub z1.b, z1.b, #7 // =0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -734,6 +3235,64 @@ define void @sub_v16i16(ptr %a) {
 ; CHECK-NEXT:    sub z1.h, z1.h, #15 // =0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -750,6 +3309,36 @@ define void @sub_v8i32(ptr %a) {
 ; CHECK-NEXT:    sub z1.s, z1.s, #31 // =0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sub w9, w8, #31
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    sub w8, w8, #31
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -766,6 +3355,26 @@ define void @sub_v4i64(ptr %a) {
 ; CHECK-NEXT:    sub z1.d, z1.d, #63 // =0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sub_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sub x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sub x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    sub x9, x8, #63
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    sub x8, x8, #63
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -786,6 +3395,145 @@ define void @umax_v32i8(ptr %a) {
 ; CHECK-NEXT:    umax z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    tst w9, #0xf8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -802,6 +3550,81 @@ define void @umax_v16i16(ptr %a) {
 ; CHECK-NEXT:    umax z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    tst w9, #0xfff0
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -818,6 +3641,45 @@ define void @umax_v8i32(ptr %a) {
 ; CHECK-NEXT:    umax z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -834,6 +3696,31 @@ define void @umax_v4i64(ptr %a) {
 ; CHECK-NEXT:    umax z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -854,6 +3741,145 @@ define void @umin_v32i8(ptr %a) {
 ; CHECK-NEXT:    umin z1.b, z1.b, #7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #7 // =0x7
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #7
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -870,6 +3896,81 @@ define void @umin_v16i16(ptr %a) {
 ; CHECK-NEXT:    umin z1.h, z1.h, #15
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #15 // =0xf
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #15
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -886,6 +3987,45 @@ define void @umin_v8i32(ptr %a) {
 ; CHECK-NEXT:    umin z1.s, z1.s, #31
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #31 // =0x1f
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w9, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w10, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #31
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -902,6 +4042,31 @@ define void @umin_v4i64(ptr %a) {
 ; CHECK-NEXT:    umin z1.d, z1.d, #63
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #63 // =0x3f
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x9, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x10, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #63
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -922,6 +4087,112 @@ define void @xor_v32i8(ptr %a) {
 ; CHECK-NEXT:    eor z1.b, z1.b, #0x7
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %ins = insertelement <32 x i8> undef, i8 7, i64 0
   %op2 = shufflevector <32 x i8> %ins, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -938,6 +4209,64 @@ define void @xor_v16i16(ptr %a) {
 ; CHECK-NEXT:    eor z1.h, z1.h, #0xf
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0xf
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %ins = insertelement <16 x i16> undef, i16 15, i64 0
   %op2 = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -954,6 +4283,36 @@ define void @xor_v8i32(ptr %a) {
 ; CHECK-NEXT:    eor z1.s, z1.s, #0x1f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w9, w8, #0x1f
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w8, #0x1f
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %ins = insertelement <8 x i32> undef, i32 31, i64 0
   %op2 = shufflevector <8 x i32> %ins, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -970,6 +4329,26 @@ define void @xor_v4i64(ptr %a) {
 ; CHECK-NEXT:    eor z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor x9, x8, #0x3f
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    eor x8, x8, #0x3f
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %ins = insertelement <4 x i64> undef, i64 63, i64 0
   %op2 = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
index 40824ba9ae9c5..3137a7bc7ad27 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,6 +17,47 @@ define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = and <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -28,6 +70,78 @@ define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = and <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -41,6 +155,147 @@ define void @and_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = and <32 x i8> %op1, %op2
@@ -56,6 +311,31 @@ define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = and <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -68,6 +348,46 @@ define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = and <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -81,6 +401,83 @@ define void @and_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = and <16 x i16> %op1, %op2
@@ -96,6 +493,22 @@ define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = and <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -108,6 +521,28 @@ define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = and <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -121,6 +556,47 @@ define void @and_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = and <8 x i32> %op1, %op2
@@ -136,6 +612,18 @@ define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = and <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -148,6 +636,21 @@ define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = and <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -161,6 +664,33 @@ define void @and_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: and_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    and x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = and <4 x i64> %op1, %op2
@@ -180,6 +710,47 @@ define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = or <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -192,6 +763,78 @@ define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = or <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -205,6 +848,147 @@ define void @or_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = or <32 x i8> %op1, %op2
@@ -220,6 +1004,31 @@ define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = or <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -232,6 +1041,46 @@ define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = or <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -245,6 +1094,83 @@ define void @or_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = or <16 x i16> %op1, %op2
@@ -260,6 +1186,22 @@ define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = or <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -272,6 +1214,28 @@ define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = or <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -285,6 +1249,47 @@ define void @or_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = or <8 x i32> %op1, %op2
@@ -300,6 +1305,18 @@ define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = or <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -312,6 +1329,21 @@ define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = or <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -325,6 +1357,33 @@ define void @or_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: or_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    orr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = or <4 x i64> %op1, %op2
@@ -344,6 +1403,47 @@ define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -356,6 +1456,78 @@ define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -369,6 +1541,147 @@ define void @xor_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = xor <32 x i8> %op1, %op2
@@ -384,6 +1697,31 @@ define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -396,6 +1734,46 @@ define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -409,6 +1787,83 @@ define void @xor_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = xor <16 x i16> %op1, %op2
@@ -424,6 +1879,22 @@ define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -436,6 +1907,28 @@ define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -449,6 +1942,47 @@ define void @xor_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = xor <8 x i32> %op1, %op2
@@ -464,6 +1998,18 @@ define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -476,6 +2022,21 @@ define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    eor z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = xor <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -489,6 +2050,33 @@ define void @xor_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: xor_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    eor x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = xor <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
index 74ee5482a60c4..4775a965b70d7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,55 @@ define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -30,6 +80,94 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -45,6 +183,179 @@ define void @smax_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -61,6 +372,35 @@ define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -74,6 +414,54 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -89,6 +477,99 @@ define void @smax_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -105,6 +586,23 @@ define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -118,6 +616,30 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -133,6 +655,51 @@ define void @smax_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, gt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -150,6 +717,19 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -164,6 +744,22 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -179,6 +775,35 @@ define void @smax_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smax_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, gt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, gt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -199,6 +824,55 @@ define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -212,6 +886,94 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -227,6 +989,179 @@ define void @smin_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -243,6 +1178,35 @@ define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -256,6 +1220,54 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -271,6 +1283,99 @@ define void @smin_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -287,6 +1392,23 @@ define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -300,6 +1422,30 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -315,6 +1461,51 @@ define void @smin_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lt
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -332,6 +1523,19 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -346,6 +1550,22 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -361,6 +1581,35 @@ define void @smin_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    smin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smin_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lt
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lt
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -381,6 +1630,55 @@ define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -394,6 +1692,94 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -409,6 +1795,179 @@ define void @umax_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -425,6 +1984,35 @@ define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -438,6 +2026,54 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -453,6 +2089,99 @@ define void @umax_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -469,6 +2198,23 @@ define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -482,6 +2228,30 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -497,6 +2267,51 @@ define void @umax_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, hi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -514,6 +2329,19 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -528,6 +2356,22 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -543,6 +2387,35 @@ define void @umax_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umax_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, hi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, hi
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
@@ -563,6 +2436,55 @@ define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
   ret <8 x i8> %res
 }
@@ -576,6 +2498,94 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
   ret <16 x i8> %res
 }
@@ -591,6 +2601,179 @@ define void @umin_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
@@ -607,6 +2790,35 @@ define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
   ret <4 x i16> %res
 }
@@ -620,6 +2832,54 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
   ret <8 x i16> %res
 }
@@ -635,6 +2895,99 @@ define void @umin_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
@@ -651,6 +3004,23 @@ define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
   ret <2 x i32> %res
 }
@@ -664,6 +3034,30 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
   ret <4 x i32> %res
 }
@@ -679,6 +3073,51 @@ define void @umin_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w10, w8
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, lo
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
@@ -696,6 +3135,19 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
   ret <1 x i64> %res
 }
@@ -710,6 +3162,22 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
   ret <2 x i64> %res
 }
@@ -725,6 +3193,35 @@ define void @umin_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    umin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umin_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x10, x8
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, lo
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, lo
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
index 3ff6983210a0a..94d5bb1543b0e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible < %s | FileCheck %s -check-prefix=FA64
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -check-prefix=NO-FA64
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -20,6 +21,55 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
 ; NO-FA64-NEXT:    mad z0.b, p0/m, z1.b, z2.b
 ; NO-FA64-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; NO-FA64-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: mla8xi8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #7]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #6]
+; NONEON-NOSVE-NEXT:    madd w1, w2, w1, w5
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #9]
+; NONEON-NOSVE-NEXT:    madd w1, w4, w3, w1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #21]
+; NONEON-NOSVE-NEXT:    madd w18, w0, w18, w1
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #20]
+; NONEON-NOSVE-NEXT:    madd w16, w17, w16, w18
+; NONEON-NOSVE-NEXT:    strb w16, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #19]
+; NONEON-NOSVE-NEXT:    madd w14, w15, w14, w16
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    madd w12, w13, w12, w14
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #17]
+; NONEON-NOSVE-NEXT:    madd w10, w11, w10, w12
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    madd w8, w9, w8, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = mul <8 x i8> %A, %B;
   %tmp2 = add <8 x i8> %C, %tmp1;
   ret <8 x i8> %tmp2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index 8917f43002daf..6198926c0b438 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2 -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 ; This test only tests the legal types for a given vector width, as mulh nodes
 ; do not get generated for non-legal types.
@@ -36,6 +37,35 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE2-NEXT:    lsr z0.h, z0.h, #4
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w13
+; NONEON-NOSVE-NEXT:    mul w10, w10, w14
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    ubfx w8, w8, #4, #12
+; NONEON-NOSVE-NEXT:    ubfx w9, w9, #4, #12
+; NONEON-NOSVE-NEXT:    ubfx w10, w10, #4, #12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w8, w11, #4, #12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i16> undef, i16 4, i64 0
   %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
   %1 = sext <4 x i8> %op1 to <4 x i16>
@@ -63,6 +93,55 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE2-NEXT:    smulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w16
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #8
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #31]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w16
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w18
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 8, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
   %1 = sext <8 x i8> %op1 to <8 x i16>
@@ -90,6 +169,120 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE2-NEXT:    smulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    str x27, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #96] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w6, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w7, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w19, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w20, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w25, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w26, [sp, #60]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w20, w20, w21
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w19, w19, w23
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    mul w7, w7, w25
+; NONEON-NOSVE-NEXT:    ldrsb w2, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w3, [sp, #42]
+; NONEON-NOSVE-NEXT:    mul w6, w6, w26
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #8
+; NONEON-NOSVE-NEXT:    ldrsb w4, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #88]
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #8
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #90]
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #8
+; NONEON-NOSVE-NEXT:    ldrsb w1, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w5, [sp, #92]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w6, w6, #8
+; NONEON-NOSVE-NEXT:    ldrsb w22, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w24, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w1
+; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w5
+; NONEON-NOSVE-NEXT:    ldrsb w27, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w25, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w24
+; NONEON-NOSVE-NEXT:    ldrsb w26, [sp, #57]
+; NONEON-NOSVE-NEXT:    mul w0, w0, w23
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    mul w4, w4, w27
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    mul w3, w3, w25
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #79]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w26
+; NONEON-NOSVE-NEXT:    lsr w0, w0, #8
+; NONEON-NOSVE-NEXT:    strb w19, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w17, w17, w21
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #8
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #77]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w22
+; NONEON-NOSVE-NEXT:    lsr w3, w3, #8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #8
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #75]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w14
+; NONEON-NOSVE-NEXT:    lsr w17, w17, #8
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x27, [sp, #80] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #71]
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #66]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <16 x i8> %op1 to <16 x i16>
   %2 = sext <16 x i8> %op2 to <16 x i16>
   %mul = mul <16 x i16> %1, %2
@@ -118,6 +311,255 @@ define void @smulh_v32i8(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #384
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #304] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #320] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 384
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov x29, x0
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #185]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #187]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #189]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #229]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #227]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #228]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #190]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #191]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #177]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldrsb w2, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrsb w1, [sp, #215]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #179]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrsb w4, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrsb w3, [sp, #213]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #181]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #247]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #246]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #244]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #183]
+; NONEON-NOSVE-NEXT:    mul w26, w12, w16
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #242]
+; NONEON-NOSVE-NEXT:    ldrsb w16, [sp, #250]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #233]
+; NONEON-NOSVE-NEXT:    mul w30, w10, w12
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #255]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #253]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #235]
+; NONEON-NOSVE-NEXT:    ldrsb w0, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #249]
+; NONEON-NOSVE-NEXT:    ldrsb w6, [sp, #210]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #237]
+; NONEON-NOSVE-NEXT:    ldrsb w5, [sp, #211]
+; NONEON-NOSVE-NEXT:    ldrsb w19, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrsb w7, [sp, #209]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #239]
+; NONEON-NOSVE-NEXT:    ldrsb w21, [sp, #222]
+; NONEON-NOSVE-NEXT:    ldrsb w20, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldrsb w23, [sp, #220]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #225]
+; NONEON-NOSVE-NEXT:    ldrsb w22, [sp, #221]
+; NONEON-NOSVE-NEXT:    ldrsb w24, [sp, #219]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #231]
+; NONEON-NOSVE-NEXT:    mul w27, w8, w14
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #245]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #217]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w15
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #251]
+; NONEON-NOSVE-NEXT:    mul w25, w13, w14
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #243]
+; NONEON-NOSVE-NEXT:    lsr w14, w27, #8
+; NONEON-NOSVE-NEXT:    ldrsb w27, [sp, #218]
+; NONEON-NOSVE-NEXT:    lsr w17, w9, #8
+; NONEON-NOSVE-NEXT:    mul w28, w11, w13
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #287]
+; NONEON-NOSVE-NEXT:    lsr w14, w25, #8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #241]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #285]
+; NONEON-NOSVE-NEXT:    lsr w14, w28, #8
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #254]
+; NONEON-NOSVE-NEXT:    mul w8, w25, w8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #252]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #283]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w9, w25, w9
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #286]
+; NONEON-NOSVE-NEXT:    mul w12, w14, w12
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    lsr w17, w26, #8
+; NONEON-NOSVE-NEXT:    mul w10, w25, w10
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
+; NONEON-NOSVE-NEXT:    mul w11, w25, w11
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #284]
+; NONEON-NOSVE-NEXT:    lsr w17, w30, #8
+; NONEON-NOSVE-NEXT:    mul w13, w14, w13
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #320] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w15
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w16
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w0
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w2
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #274]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w3
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w13, w13, w4
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w5
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #271]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w6
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #269]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    mul w13, w13, w19
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #268]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #267]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w20
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w21
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #265]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w15, w14, [sp, #16] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #263]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w27
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #261]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w15
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #282]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w14
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #260]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #259]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #257]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #304] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x29]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #384
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %1 = sext <32 x i8> %op1 to <32 x i16>
@@ -153,6 +595,24 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE2-NEXT:    lsr z0.s, z0.s, #16
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w10
+; NONEON-NOSVE-NEXT:    mul w9, w9, w11
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i16> %op1 to <2 x i32>
   %2 = sext <2 x i16> %op2 to <2 x i32>
   %mul = mul <2 x i32> %1, %2
@@ -178,6 +638,35 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE2-NEXT:    smulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w13
+; NONEON-NOSVE-NEXT:    mul w9, w9, w14
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <4 x i16> %op1 to <4 x i32>
   %2 = sext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
@@ -203,6 +692,58 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE2-NEXT:    smulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w15, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w0, [sp, #62]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w18
+; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w16
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #16
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    strh w15, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #68]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <8 x i16> %op1 to <8 x i32>
   %2 = sext <8 x i16> %op2 to <8 x i32>
   %mul = mul <8 x i32> %1, %2
@@ -231,6 +772,129 @@ define void @smulh_v16i16(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #240
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 240
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w13, [sp, #50]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w7, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrsh w19, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrsh w20, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldrsh w21, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrsh w14, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w18, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrsh w2, [sp, #106]
+; NONEON-NOSVE-NEXT:    ldrsh w4, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldrsh w5, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldrsh w15, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w17, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w1, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w3, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w15
+; NONEON-NOSVE-NEXT:    ldrsh w6, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w23, [sp, #82]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w3
+; NONEON-NOSVE-NEXT:    ldrsh w25, [sp, #84]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w25
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    mul w12, w12, w6
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    ldrsh w22, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrsh w24, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrsh w26, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrsh w27, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w28, [sp, #126]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w17
+; NONEON-NOSVE-NEXT:    mul w21, w21, w22
+; NONEON-NOSVE-NEXT:    ldrsh w22, [sp, #86]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w20, w20, w24
+; NONEON-NOSVE-NEXT:    ldrsh w24, [sp, #120]
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    mul w19, w19, w26
+; NONEON-NOSVE-NEXT:    ldrsh w26, [sp, #124]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    mul w7, w7, w27
+; NONEON-NOSVE-NEXT:    ldrsh w27, [sp, #122]
+; NONEON-NOSVE-NEXT:    lsr w21, w21, #16
+; NONEON-NOSVE-NEXT:    mul w5, w5, w28
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    mul w4, w4, w26
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #16
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #158]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w27
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #16
+; NONEON-NOSVE-NEXT:    strh w20, [sp, #156]
+; NONEON-NOSVE-NEXT:    mul w18, w18, w24
+; NONEON-NOSVE-NEXT:    lsr w5, w5, #16
+; NONEON-NOSVE-NEXT:    strh w19, [sp, #154]
+; NONEON-NOSVE-NEXT:    mul w16, w16, w22
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #16
+; NONEON-NOSVE-NEXT:    strh w7, [sp, #152]
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #16
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #150]
+; NONEON-NOSVE-NEXT:    lsr w18, w18, #16
+; NONEON-NOSVE-NEXT:    strh w4, [sp, #148]
+; NONEON-NOSVE-NEXT:    lsr w16, w16, #16
+; NONEON-NOSVE-NEXT:    strh w2, [sp, #146]
+; NONEON-NOSVE-NEXT:    strh w18, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w16, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #132]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #240
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %1 = sext <16 x i16> %op1 to <16 x i32>
@@ -259,6 +923,22 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE2-NEXT:    smulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldpsw x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    smull x9, w9, w10
+; NONEON-NOSVE-NEXT:    smull x8, w8, w11
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i32> %op1 to <2 x i64>
   %2 = sext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -284,6 +964,32 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE2-NEXT:    smulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldpsw x10, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x13, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    smull x11, w11, w12
+; NONEON-NOSVE-NEXT:    ldpsw x12, x14, [sp, #56]
+; NONEON-NOSVE-NEXT:    smull x10, w10, w13
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    smull x9, w9, w14
+; NONEON-NOSVE-NEXT:    smull x8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <4 x i32> %op1 to <4 x i64>
   %2 = sext <4 x i32> %op2 to <4 x i64>
   %mul = mul <4 x i64> %1, %2
@@ -312,6 +1018,56 @@ define void @smulh_v8i32(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldpsw x8, x9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldpsw x10, x11, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldpsw x12, x13, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldpsw x14, x15, [sp, #96]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldpsw x17, x16, [sp, #112]
+; NONEON-NOSVE-NEXT:    smull x15, w15, w16
+; NONEON-NOSVE-NEXT:    ldpsw x16, x18, [sp, #120]
+; NONEON-NOSVE-NEXT:    smull x14, w14, w17
+; NONEON-NOSVE-NEXT:    ldpsw x17, x1, [sp, #80]
+; NONEON-NOSVE-NEXT:    smull x13, w13, w18
+; NONEON-NOSVE-NEXT:    lsr x15, x15, #32
+; NONEON-NOSVE-NEXT:    smull x12, w12, w16
+; NONEON-NOSVE-NEXT:    lsr x14, x14, #32
+; NONEON-NOSVE-NEXT:    ldpsw x16, x18, [sp, #88]
+; NONEON-NOSVE-NEXT:    smull x11, w11, w1
+; NONEON-NOSVE-NEXT:    lsr x13, x13, #32
+; NONEON-NOSVE-NEXT:    stp w14, w15, [sp, #152]
+; NONEON-NOSVE-NEXT:    smull x10, w10, w17
+; NONEON-NOSVE-NEXT:    lsr x12, x12, #32
+; NONEON-NOSVE-NEXT:    smull x9, w9, w18
+; NONEON-NOSVE-NEXT:    smull x8, w8, w16
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    stp w12, w13, [sp, #144]
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %1 = sext <8 x i32> %op1 to <8 x i64>
@@ -340,6 +1096,18 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE2-NEXT:    smulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    fmov x9, d1
+; NONEON-NOSVE-NEXT:    smulh x8, x8, x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x i128> undef, i128 64, i128 0
   %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
   %1 = sext <1 x i64> %op1 to <1 x i128>
@@ -367,6 +1135,21 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE2-NEXT:    smulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    smulh x8, x8, x10
+; NONEON-NOSVE-NEXT:    smulh x9, x9, x11
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
@@ -395,6 +1178,33 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 ; SVE2-NEXT:    smulh z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smulh_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    smulh x10, x10, x12
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    smulh x11, x11, x13
+; NONEON-NOSVE-NEXT:    smulh x8, x8, x12
+; NONEON-NOSVE-NEXT:    smulh x9, x9, x14
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %1 = sext <4 x i64> %op1 to <4 x i128>
@@ -433,6 +1243,35 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE2-NEXT:    lsr z0.h, z0.h, #4
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w13
+; NONEON-NOSVE-NEXT:    mul w10, w10, w14
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #4
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #4
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w11, #4
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i8> %op1 to <4 x i16>
   %2 = zext <4 x i8> %op2 to <4 x i16>
   %mul = mul <4 x i16> %1, %2
@@ -458,6 +1297,55 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE2-NEXT:    umulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #22]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w16
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #8
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #17]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #31]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w16
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #30]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w18
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <8 x i8> %op1 to <8 x i16>
   %2 = zext <8 x i8> %op2 to <8 x i16>
   %mul = mul <8 x i16> %1, %2
@@ -483,6 +1371,120 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE2-NEXT:    umulh z0.b, z0.b, z1.b
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    str x27, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #96] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -80
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d2, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #60]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #52]
+; NONEON-NOSVE-NEXT:    mul w20, w20, w21
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #54]
+; NONEON-NOSVE-NEXT:    mul w19, w19, w23
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    mul w7, w7, w25
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #42]
+; NONEON-NOSVE-NEXT:    mul w6, w6, w26
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #8
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #88]
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #90]
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #8
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #92]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w6, w6, #8
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w1
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w5
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #58]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w24
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #57]
+; NONEON-NOSVE-NEXT:    mul w0, w0, w23
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #8
+; NONEON-NOSVE-NEXT:    mul w4, w4, w27
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #8
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    mul w3, w3, w25
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #8
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #79]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w26
+; NONEON-NOSVE-NEXT:    lsr w0, w0, #8
+; NONEON-NOSVE-NEXT:    strb w19, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w17, w17, w21
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #8
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #77]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w22
+; NONEON-NOSVE-NEXT:    lsr w3, w3, #8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #8
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #75]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w14
+; NONEON-NOSVE-NEXT:    lsr w17, w17, #8
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #8
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x27, [sp, #80] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #8
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #71]
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #66]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <16 x i8> %op1 to <16 x i16>
   %2 = zext <16 x i8> %op2 to <16 x i16>
   %mul = mul <16 x i16> %1, %2
@@ -511,6 +1513,255 @@ define void @umulh_v32i8(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #384
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #304] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #320] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 384
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov x29, x0
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #185]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #187]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #189]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #229]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #227]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #228]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #190]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #191]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #177]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #215]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #179]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #213]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #181]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #247]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #246]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #244]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #183]
+; NONEON-NOSVE-NEXT:    mul w26, w12, w16
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #242]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #250]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #233]
+; NONEON-NOSVE-NEXT:    mul w30, w10, w12
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #255]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #253]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #234]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #235]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #249]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #210]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #237]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #211]
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #209]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #239]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #222]
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #220]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #225]
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #221]
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #219]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #230]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #231]
+; NONEON-NOSVE-NEXT:    mul w27, w8, w14
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #245]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #217]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #251]
+; NONEON-NOSVE-NEXT:    mul w25, w13, w14
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #243]
+; NONEON-NOSVE-NEXT:    lsr w14, w27, #8
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #218]
+; NONEON-NOSVE-NEXT:    lsr w17, w9, #8
+; NONEON-NOSVE-NEXT:    mul w28, w11, w13
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #287]
+; NONEON-NOSVE-NEXT:    lsr w14, w25, #8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #241]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #285]
+; NONEON-NOSVE-NEXT:    lsr w14, w28, #8
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #254]
+; NONEON-NOSVE-NEXT:    mul w8, w25, w8
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #252]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #283]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w9, w25, w9
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #286]
+; NONEON-NOSVE-NEXT:    mul w12, w14, w12
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #8
+; NONEON-NOSVE-NEXT:    lsr w17, w26, #8
+; NONEON-NOSVE-NEXT:    mul w10, w25, w10
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #281]
+; NONEON-NOSVE-NEXT:    mul w11, w25, w11
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #284]
+; NONEON-NOSVE-NEXT:    lsr w17, w30, #8
+; NONEON-NOSVE-NEXT:    mul w13, w14, w13
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #320] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w15
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #279]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w16
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #278]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    mul w12, w12, w18
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #277]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w0
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #275]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w2
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #274]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #273]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w3
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w13, w13, w4
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    mul w10, w10, w5
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #271]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #270]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w6
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #269]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    mul w13, w13, w19
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #268]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #267]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w20
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w11, w11, w21
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #266]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w12, w12, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #265]
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp w15, w14, [sp, #16] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    mul w10, w10, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #263]
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    mul w11, w11, w27
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #262]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #261]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w15
+; NONEON-NOSVE-NEXT:    lsr w8, w10, #8
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #282]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w14
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #260]
+; NONEON-NOSVE-NEXT:    lsr w9, w11, #8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #259]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    lsr w8, w12, #8
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #258]
+; NONEON-NOSVE-NEXT:    lsr w9, w13, #8
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #257]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #304] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x29]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #384
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %1 = zext <32 x i8> %op1 to <32 x i16>
@@ -545,6 +1796,24 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE2-NEXT:    lsr z0.s, z0.s, #16
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w10
+; NONEON-NOSVE-NEXT:    mul w9, w9, w11
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i16> %op1 to <2 x i32>
   %2 = zext <2 x i16> %op2 to <2 x i32>
   %mul = mul <2 x i32> %1, %2
@@ -570,6 +1839,35 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE2-NEXT:    umulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w12
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #16]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w13
+; NONEON-NOSVE-NEXT:    mul w9, w9, w14
+; NONEON-NOSVE-NEXT:    mul w8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i16> %op1 to <4 x i32>
   %2 = zext <4 x i16> %op2 to <4 x i32>
   %mul = mul <4 x i32> %1, %2
@@ -595,6 +1893,58 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE2-NEXT:    umulh z0.h, z0.h, z1.h
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #62]
+; NONEON-NOSVE-NEXT:    mul w15, w15, w16
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w17
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #56]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w18
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #60]
+; NONEON-NOSVE-NEXT:    mul w12, w12, w16
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w15, w15, #16
+; NONEON-NOSVE-NEXT:    mul w11, w11, w0
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w10, w10, w18
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    strh w15, [sp, #78]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w16
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #76]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w17
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #74]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #68]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <8 x i16> %op1 to <8 x i32>
   %2 = zext <8 x i16> %op2 to <8 x i32>
   %mul = mul <8 x i32> %1, %2
@@ -623,6 +1973,129 @@ define void @umulh_v16i16(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #240
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 240
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #50]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldrh w21, [sp, #102]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #54]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #106]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #94]
+; NONEON-NOSVE-NEXT:    mul w8, w8, w15
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #82]
+; NONEON-NOSVE-NEXT:    mul w11, w11, w3
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #84]
+; NONEON-NOSVE-NEXT:    mul w13, w13, w23
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    mul w14, w14, w25
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    mul w12, w12, w6
+; NONEON-NOSVE-NEXT:    lsr w11, w11, #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    mul w10, w10, w1
+; NONEON-NOSVE-NEXT:    lsr w13, w13, #16
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #126]
+; NONEON-NOSVE-NEXT:    mul w9, w9, w17
+; NONEON-NOSVE-NEXT:    mul w21, w21, w22
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #86]
+; NONEON-NOSVE-NEXT:    lsr w14, w14, #16
+; NONEON-NOSVE-NEXT:    mul w20, w20, w24
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #120]
+; NONEON-NOSVE-NEXT:    lsr w12, w12, #16
+; NONEON-NOSVE-NEXT:    mul w19, w19, w26
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #124]
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #16
+; NONEON-NOSVE-NEXT:    mul w7, w7, w27
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #122]
+; NONEON-NOSVE-NEXT:    lsr w21, w21, #16
+; NONEON-NOSVE-NEXT:    mul w5, w5, w28
+; NONEON-NOSVE-NEXT:    lsr w20, w20, #16
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #16
+; NONEON-NOSVE-NEXT:    mul w4, w4, w26
+; NONEON-NOSVE-NEXT:    lsr w19, w19, #16
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #158]
+; NONEON-NOSVE-NEXT:    mul w2, w2, w27
+; NONEON-NOSVE-NEXT:    lsr w7, w7, #16
+; NONEON-NOSVE-NEXT:    strh w20, [sp, #156]
+; NONEON-NOSVE-NEXT:    mul w18, w18, w24
+; NONEON-NOSVE-NEXT:    lsr w5, w5, #16
+; NONEON-NOSVE-NEXT:    strh w19, [sp, #154]
+; NONEON-NOSVE-NEXT:    mul w16, w16, w22
+; NONEON-NOSVE-NEXT:    lsr w4, w4, #16
+; NONEON-NOSVE-NEXT:    strh w7, [sp, #152]
+; NONEON-NOSVE-NEXT:    lsr w2, w2, #16
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #150]
+; NONEON-NOSVE-NEXT:    lsr w18, w18, #16
+; NONEON-NOSVE-NEXT:    strh w4, [sp, #148]
+; NONEON-NOSVE-NEXT:    lsr w16, w16, #16
+; NONEON-NOSVE-NEXT:    strh w2, [sp, #146]
+; NONEON-NOSVE-NEXT:    strh w18, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w16, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #134]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #132]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #240
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %1 = zext <16 x i16> %op1 to <16 x i32>
@@ -651,6 +2124,22 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE2-NEXT:    umulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    umull x9, w9, w10
+; NONEON-NOSVE-NEXT:    umull x8, w8, w11
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i32> %op1 to <2 x i64>
   %2 = zext <2 x i32> %op2 to <2 x i64>
   %mul = mul <2 x i64> %1, %2
@@ -676,6 +2165,32 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE2-NEXT:    umulh z0.s, z0.s, z1.s
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w13, w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    umull x11, w11, w12
+; NONEON-NOSVE-NEXT:    ldp w12, w14, [sp, #56]
+; NONEON-NOSVE-NEXT:    umull x10, w10, w13
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    umull x9, w9, w14
+; NONEON-NOSVE-NEXT:    umull x8, w8, w12
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <4 x i32> %op1 to <4 x i64>
   %2 = zext <4 x i32> %op2 to <4 x i64>
   %mul = mul <4 x i64> %1, %2
@@ -704,6 +2219,56 @@ define void @umulh_v8i32(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w17, w16, [sp, #112]
+; NONEON-NOSVE-NEXT:    umull x15, w15, w16
+; NONEON-NOSVE-NEXT:    ldp w16, w18, [sp, #120]
+; NONEON-NOSVE-NEXT:    umull x14, w14, w17
+; NONEON-NOSVE-NEXT:    ldp w17, w1, [sp, #80]
+; NONEON-NOSVE-NEXT:    umull x13, w13, w18
+; NONEON-NOSVE-NEXT:    lsr x15, x15, #32
+; NONEON-NOSVE-NEXT:    umull x12, w12, w16
+; NONEON-NOSVE-NEXT:    lsr x14, x14, #32
+; NONEON-NOSVE-NEXT:    ldp w16, w18, [sp, #88]
+; NONEON-NOSVE-NEXT:    umull x11, w11, w1
+; NONEON-NOSVE-NEXT:    lsr x13, x13, #32
+; NONEON-NOSVE-NEXT:    stp w14, w15, [sp, #152]
+; NONEON-NOSVE-NEXT:    umull x10, w10, w17
+; NONEON-NOSVE-NEXT:    lsr x12, x12, #32
+; NONEON-NOSVE-NEXT:    umull x9, w9, w18
+; NONEON-NOSVE-NEXT:    umull x8, w8, w16
+; NONEON-NOSVE-NEXT:    lsr x11, x11, #32
+; NONEON-NOSVE-NEXT:    stp w12, w13, [sp, #144]
+; NONEON-NOSVE-NEXT:    lsr x10, x10, #32
+; NONEON-NOSVE-NEXT:    lsr x9, x9, #32
+; NONEON-NOSVE-NEXT:    lsr x8, x8, #32
+; NONEON-NOSVE-NEXT:    stp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %insert = insertelement <8 x i64> undef, i64 32, i64 0
@@ -734,6 +2299,18 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE2-NEXT:    umulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    fmov x9, d1
+; NONEON-NOSVE-NEXT:    umulh x8, x8, x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <1 x i64> %op1 to <1 x i128>
   %2 = zext <1 x i64> %op2 to <1 x i128>
   %mul = mul <1 x i128> %1, %2
@@ -759,6 +2336,21 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE2-NEXT:    umulh z0.d, z0.d, z1.d
 ; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    umulh x8, x8, x10
+; NONEON-NOSVE-NEXT:    umulh x9, x9, x11
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
   %mul = mul <2 x i128> %1, %2
@@ -787,6 +2379,33 @@ define void @umulh_v4i64(ptr %a, ptr %b) {
 ; SVE2-NEXT:    umulh z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umulh_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    umulh x10, x10, x12
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    umulh x11, x11, x13
+; NONEON-NOSVE-NEXT:    umulh x8, x8, x12
+; NONEON-NOSVE-NEXT:    umulh x9, x9, x14
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %1 = zext <4 x i64> %op1 to <4 x i128>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index 1123907f33899..7bdb4599707b0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -17,6 +18,29 @@ define i8 @uaddv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w10, w12, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w14
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -30,6 +54,44 @@ define i8 @uaddv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w11, w14, w13
+; NONEON-NOSVE-NEXT:    add w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w12, w12, w14
+; NONEON-NOSVE-NEXT:    add w8, w8, w11
+; NONEON-NOSVE-NEXT:    add w9, w10, w9
+; NONEON-NOSVE-NEXT:    add w10, w12, w16
+; NONEON-NOSVE-NEXT:    add w8, w8, w15
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w13
+; NONEON-NOSVE-NEXT:    add w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -44,6 +106,77 @@ define i8 @uaddv_v32i8(ptr %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w9, w11, w10
+; NONEON-NOSVE-NEXT:    add w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    add w11, w15, w14
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w14, w15, w14
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w9, w9, w14
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w10, w10, w12
+; NONEON-NOSVE-NEXT:    add w11, w16, w11
+; NONEON-NOSVE-NEXT:    add w10, w10, w11
+; NONEON-NOSVE-NEXT:    add w11, w17, w13
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -58,6 +191,21 @@ define i16 @uaddv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -71,6 +219,28 @@ define i16 @uaddv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w10, w12, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w14
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -85,6 +255,45 @@ define i16 @uaddv_v16i16(ptr %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w9, w11, w10
+; NONEON-NOSVE-NEXT:    add w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w13, w15, w14
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w9, w12, w13
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w10, w14, w10
+; NONEON-NOSVE-NEXT:    add w11, w15, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w13, w12
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -99,6 +308,16 @@ define i32 @uaddv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -112,6 +331,17 @@ define i32 @uaddv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w0, w10, w8
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -126,6 +356,25 @@ define i32 @uaddv_v8i32(ptr %a) {
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w9, w11, w9
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w10, w14, w12
+; NONEON-NOSVE-NEXT:    add w11, w15, w13
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -139,6 +388,14 @@ define i64 @uaddv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    add x0, x9, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -152,6 +409,18 @@ define i64 @uaddv_v4i64(ptr %a) {
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uaddv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    add x8, x10, x8
+; NONEON-NOSVE-NEXT:    add x9, x11, x9
+; NONEON-NOSVE-NEXT:    add x0, x9, x8
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -169,6 +438,36 @@ define i8 @smaxv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -181,6 +480,59 @@ define i8 @smaxv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -194,6 +546,108 @@ define i8 @smaxv_v32i8(ptr %a) {
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -207,6 +661,24 @@ define i16 @smaxv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -219,6 +691,35 @@ define i16 @smaxv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -232,6 +733,60 @@ define i16 @smaxv_v16i16(ptr %a) {
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -245,6 +800,17 @@ define i32 @smaxv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -257,6 +823,21 @@ define i32 @smaxv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -270,6 +851,32 @@ define i32 @smaxv_v8i32(ptr %a) {
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, gt
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, gt
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, gt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, gt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -284,6 +891,15 @@ define i64 @smaxv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    smaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -297,6 +913,22 @@ define i64 @smaxv_v4i64(ptr %a) {
 ; CHECK-NEXT:    smaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: smaxv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, gt
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, gt
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, gt
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -314,6 +946,36 @@ define i8 @sminv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -326,6 +988,59 @@ define i8 @sminv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -339,6 +1054,108 @@ define i8 @sminv_v32i8(ptr %a) {
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -352,6 +1169,24 @@ define i16 @sminv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -364,6 +1199,35 @@ define i16 @sminv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -377,6 +1241,60 @@ define i16 @sminv_v16i16(ptr %a) {
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    ldrsh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -390,6 +1308,17 @@ define i32 @sminv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -402,6 +1331,21 @@ define i32 @sminv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -415,6 +1359,32 @@ define i32 @sminv_v8i32(ptr %a) {
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lt
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lt
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, lt
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lt
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -429,6 +1399,15 @@ define i64 @sminv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    sminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -442,6 +1421,22 @@ define i64 @sminv_v4i64(ptr %a) {
 ; CHECK-NEXT:    sminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sminv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, lt
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, lt
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lt
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -459,6 +1454,36 @@ define i8 @umaxv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -471,6 +1496,59 @@ define i8 @umaxv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -484,6 +1562,108 @@ define i8 @umaxv_v32i8(ptr %a) {
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -497,6 +1677,24 @@ define i16 @umaxv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -509,6 +1707,35 @@ define i16 @umaxv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -522,6 +1749,60 @@ define i16 @umaxv_v16i16(ptr %a) {
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -535,6 +1816,17 @@ define i32 @umaxv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -547,6 +1839,21 @@ define i32 @umaxv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -560,6 +1867,32 @@ define i32 @umaxv_v8i32(ptr %a) {
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, hi
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -574,6 +1907,15 @@ define i64 @umaxv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    umaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -587,6 +1929,22 @@ define i64 @umaxv_v4i64(ptr %a) {
 ; CHECK-NEXT:    umaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: umaxv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, hi
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, hi
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, hi
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -604,6 +1962,36 @@ define i8 @uminv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -616,6 +2004,59 @@ define i8 @uminv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -629,6 +2070,108 @@ define i8 @uminv_v32i8(ptr %a) {
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #3]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #5]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #7]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #9]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #11]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #15]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -642,6 +2185,24 @@ define i16 @uminv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -654,6 +2215,35 @@ define i16 @uminv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -667,6 +2257,60 @@ define i16 @uminv_v16i16(ptr %a) {
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #14]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -680,6 +2324,17 @@ define i32 @uminv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w0, w9, w8, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -692,6 +2347,21 @@ define i32 @uminv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -705,6 +2375,32 @@ define i32 @uminv_v8i32(ptr %a) {
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w11, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldp w10, w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    ldp w11, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w10, w11
+; NONEON-NOSVE-NEXT:    csel w10, w10, w11, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    cmp w12, w9
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w0, w8, w9, lo
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -719,6 +2415,15 @@ define i64 @uminv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    uminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -732,6 +2437,22 @@ define i64 @uminv_v4i64(ptr %a) {
 ; CHECK-NEXT:    uminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uminv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp], #32
+; NONEON-NOSVE-NEXT:    cmp x8, x9
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, lo
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, lo
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x0, x9, x8, lo
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
   ret i64 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index 4ae7586fca169..cb1fb20ec9d8d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -24,6 +25,35 @@ define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w10, w16, w14, w15
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -53,6 +83,55 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -102,6 +181,94 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z3.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -189,6 +356,179 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z2.b, p0/m, z7.b, z4.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = srem <32 x i8> %op1, %op2
@@ -210,6 +550,35 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -238,6 +607,54 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -282,6 +699,99 @@ define void @srem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z7.h, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = srem <16 x i16> %op1, %op2
@@ -300,6 +810,24 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -315,6 +843,32 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -334,6 +888,54 @@ define void @srem_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    sdiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    sdiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = srem <8 x i32> %op1, %op2
@@ -352,6 +954,19 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -367,6 +982,23 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    sdiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = srem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -386,6 +1018,37 @@ define void @srem_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: srem_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    sdiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x11, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sdiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = srem <4 x i64> %op1, %op2
@@ -413,6 +1076,35 @@ define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    udiv w13, w12, w11
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w16, w15, w14
+; NONEON-NOSVE-NEXT:    msub w9, w13, w11, w12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w0, w18, w17
+; NONEON-NOSVE-NEXT:    msub w10, w16, w14, w15
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    msub w8, w0, w17, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -442,6 +1134,55 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -491,6 +1232,94 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    mls z0.b, p0/m, z3.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -578,6 +1407,179 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z2.b, p0/m, z7.b, z4.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = urem <32 x i8> %op1, %op2
@@ -599,6 +1601,35 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -627,6 +1658,54 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -671,6 +1750,99 @@ define void @urem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z0.h, p0/m, z7.h, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = urem <16 x i16> %op1, %op2
@@ -689,6 +1861,24 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -704,6 +1894,32 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -723,6 +1939,54 @@ define void @urem_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    udiv w10, w11, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w11
+; NONEON-NOSVE-NEXT:    str w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w11, w10, w8, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    udiv w10, w9, w8
+; NONEON-NOSVE-NEXT:    msub w8, w10, w8, w9
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = urem <8 x i32> %op1, %op2
@@ -741,6 +2005,19 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -756,6 +2033,23 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    udiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = urem <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -775,6 +2069,37 @@ define void @urem_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mls z1.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: urem_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    udiv x10, x11, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x11
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x11, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    udiv x10, x9, x8
+; NONEON-NOSVE-NEXT:    msub x8, x10, x8, x9
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = urem <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
index bfffe4b6315d7..5cee1360f6f3c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -16,6 +17,32 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i8> %op1, <4 x i8> %op2
   ret <4 x i8> %sel
 }
@@ -31,6 +58,48 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
 }
@@ -46,6 +115,79 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
 }
@@ -64,6 +206,151 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.b, p0, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #61]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #59]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #57]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #53]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <32 x i8>, ptr %a
   %op2 = load volatile <32 x i8>, ptr %b
   %sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2
@@ -83,6 +370,22 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i16> %op1, <2 x i16> %op2
   ret <2 x i16> %sel
 }
@@ -99,6 +402,32 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
 }
@@ -115,6 +444,47 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
 }
@@ -134,6 +504,87 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <16 x i16>, ptr %a
   %op2 = load volatile <16 x i16>, ptr %b
   %sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2
@@ -153,6 +604,22 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
 }
@@ -169,6 +636,27 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w10, w8, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
 }
@@ -188,6 +676,47 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w11, w8, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, ne
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <8 x i32>, ptr %a
   %op2 = load volatile <8 x i32>, ptr %b
   %sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2
@@ -208,6 +737,19 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, ne
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
 }
@@ -225,6 +767,21 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel x11, x10, x8, ne
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, ne
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
 }
@@ -245,6 +802,35 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0, #16]
+; NONEON-NOSVE-NEXT:    tst w2, #0x1
+; NONEON-NOSVE-NEXT:    ldr q2, [x1]
+; NONEON-NOSVE-NEXT:    ldr q3, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel x11, x8, x10, ne
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, ne
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel x11, x8, x10, ne
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    csel x8, x8, x9, ne
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load volatile <4 x i64>, ptr %a
   %op2 = load volatile <4 x i64>, ptr %b
   %sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
index 9319bd69c25fb..2778e93416a74 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,6 +20,31 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w13, [sp, #10]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsb w14, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w10, w11, w10
+; NONEON-NOSVE-NEXT:    asr w11, w13, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w14, w9
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -32,6 +58,47 @@ define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -45,6 +112,78 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -60,6 +199,147 @@ define void @ashr_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = ashr <32 x i8> %op1, %op2
@@ -78,6 +358,22 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    asr w9, w11, w10
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -91,6 +387,31 @@ define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -104,6 +425,46 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -119,6 +480,83 @@ define void @ashr_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = ashr <16 x i16> %op1, %op2
@@ -135,6 +573,21 @@ define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -148,6 +601,26 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -163,6 +636,43 @@ define void @ashr_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = ashr <8 x i32> %op1, %op2
@@ -179,6 +689,18 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -192,6 +714,20 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = ashr <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -207,6 +743,31 @@ define void @ashr_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    asr z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ashr_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    asr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = ashr <4 x i64> %op1, %op2
@@ -229,6 +790,31 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w10, w11, w10
+; NONEON-NOSVE-NEXT:    lsr w11, w13, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w14, w9
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -242,6 +828,47 @@ define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -255,6 +882,78 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -270,6 +969,147 @@ define void @lshr_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = lshr <32 x i8> %op1, %op2
@@ -288,6 +1128,22 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w11, w10
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i16> %op1, %op2
   ret <2 x i16> %res
 }
@@ -301,6 +1157,31 @@ define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -314,6 +1195,46 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -329,6 +1250,83 @@ define void @lshr_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = lshr <16 x i16> %op1, %op2
@@ -345,6 +1343,21 @@ define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -358,6 +1371,26 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -373,6 +1406,43 @@ define void @lshr_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsr w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = lshr <8 x i32> %op1, %op2
@@ -389,6 +1459,18 @@ define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -402,6 +1484,20 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = lshr <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -417,6 +1513,31 @@ define void @lshr_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: lshr_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsr x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsr x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = lshr <4 x i64> %op1, %op2
@@ -438,6 +1559,22 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v2i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i8> %op1, %op2
   ret <2 x i8> %res
 }
@@ -452,6 +1589,31 @@ define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w11, w12, w11
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #12]
+; NONEON-NOSVE-NEXT:    lsl w10, w11, w10
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    lsl w9, w10, w9
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i8> %op1, %op2
   ret <4 x i8> %res
 }
@@ -465,6 +1627,47 @@ define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <8 x i8> %op1, %op2
   ret <8 x i8> %res
 }
@@ -478,6 +1681,78 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <16 x i8> %op1, %op2
   ret <16 x i8> %res
 }
@@ -493,6 +1768,147 @@ define void @shl_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = shl <32 x i8> %op1, %op2
@@ -509,6 +1925,31 @@ define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i16> %op1, %op2
   ret <4 x i16> %res
 }
@@ -522,6 +1963,46 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <8 x i16> %op1, %op2
   ret <8 x i16> %res
 }
@@ -537,6 +2018,83 @@ define void @shl_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = shl <16 x i16> %op1, %op2
@@ -553,6 +2111,21 @@ define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i32> %op1, %op2
   ret <2 x i32> %res
 }
@@ -566,6 +2139,26 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <4 x i32> %op1, %op2
   ret <4 x i32> %res
 }
@@ -581,6 +2174,43 @@ define void @shl_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldp w9, w10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    lsl w11, w10, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %res = shl <8 x i32> %op1, %op2
@@ -597,6 +2227,18 @@ define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <1 x i64> %op1, %op2
   ret <1 x i64> %res
 }
@@ -610,6 +2252,20 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %res = shl <2 x i64> %op1, %op2
   ret <2 x i64> %res
 }
@@ -625,6 +2281,31 @@ define void @shl_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shl_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    lsl x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    ldp x9, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    lsl x11, x10, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    lsl x8, x9, x8
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %res = shl <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index 27dbfc9a23a8d..fd2d9a8fb80d1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -15,6 +16,30 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i16> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -27,6 +52,48 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x half>
   store <8 x half> %res, ptr %b
@@ -42,6 +109,80 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x half>
   store <16 x half> %res, ptr %b
@@ -61,6 +202,19 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i16> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -74,6 +228,25 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i16> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -90,6 +263,38 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -114,6 +319,62 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    ucvtf s1, s0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x float>
   store <16 x float> %res, ptr %b
@@ -132,6 +393,17 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) {
 ; CHECK-NEXT:    and w8, w8, #0xffff
 ; CHECK-NEXT:    ucvtf d0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <1 x i16> %op1 to <1 x double>
   ret <1 x double> %res
 }
@@ -146,6 +418,20 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i16> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -163,6 +449,35 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %res = uitofp <4 x i16> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -191,6 +506,57 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q3, [x1]
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -239,6 +605,103 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #164]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #156]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #152]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #148]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #140]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #136]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldp q4, q3, [sp, #192]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #328]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #188]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #184]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #176]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #172]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp q2, q5, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -258,6 +721,22 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -271,6 +750,28 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i32> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -288,6 +789,43 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x half>
   ret <8 x half> %res
@@ -312,6 +850,76 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v16i32_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = uitofp <16 x i32> %op1 to <16 x half>
   store <16 x half> %res, ptr %b
@@ -330,6 +938,18 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -342,6 +962,22 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) {
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <4 x i32> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -355,6 +991,32 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s1, w9
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -374,6 +1036,20 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i32> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -390,6 +1066,28 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = uitofp <4 x i32> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -414,6 +1112,42 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ucvtf d1, d0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -440,6 +1174,21 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -460,6 +1209,29 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x half>
   ret <4 x half> %res
@@ -493,6 +1265,47 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x half>
   ret <8 x half> %res
@@ -511,6 +1324,18 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -528,6 +1353,23 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x float>
   ret <4 x float> %res
@@ -552,6 +1394,36 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    ucvtf s1, x9
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -570,6 +1442,18 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) {
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    ucvtf d1, x9
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = uitofp <2 x i64> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -583,6 +1467,24 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ucvtf d1, x9
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ucvtf d1, x9
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = uitofp <4 x i64> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -601,6 +1503,30 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i16> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -613,6 +1539,48 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x half>
   store <8 x half> %res, ptr %b
@@ -628,6 +1596,80 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x half>
   store <16 x half> %res, ptr %b
@@ -646,6 +1688,19 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i16> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -659,6 +1714,25 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i16> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -675,6 +1749,38 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -699,6 +1805,62 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #88]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #72]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #120]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    scvtf s1, w8
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x float>
   store <16 x float> %res, ptr %b
@@ -720,6 +1882,20 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i16> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -737,6 +1913,33 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %res = sitofp <4 x i16> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -765,6 +1968,53 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q3, [x1]
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 160
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #144]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #128]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -813,6 +2063,96 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #336
+; NONEON-NOSVE-NEXT:    str x29, [sp, #320] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 336
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #320] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #72]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldp d2, d1, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    scvtf d1, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldp q4, q3, [sp, #192]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #304]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #288]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #272]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldp q2, q5, [sp, #256]
+; NONEON-NOSVE-NEXT:    stp q3, q4, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q6, q7, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q5, q2, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #336
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -832,6 +2172,22 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #12]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -845,6 +2201,28 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i32> %op1 to <4 x half>
   ret <4 x half> %res
 }
@@ -862,6 +2240,43 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    scvtf s0, w9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x half>
   ret <8 x half> %res
@@ -879,6 +2294,18 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -891,6 +2318,22 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) {
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <4 x i32> %op1 to <4 x float>
   ret <4 x float> %res
 }
@@ -904,6 +2347,32 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s1, w9
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x float>
   store <8 x float> %res, ptr %b
@@ -923,6 +2392,19 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i32> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -939,6 +2421,26 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = sitofp <4 x i32> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -963,6 +2465,38 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
 ; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-128]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x1]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x double>
   store <8 x double> %res, ptr %b
@@ -1007,6 +2541,72 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q4, q0, [x1, #32]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #256] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp d0, d2, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    scvtf d2, w9
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    scvtf d0, w9
+; NONEON-NOSVE-NEXT:    str d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #192]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #208]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #224]
+; NONEON-NOSVE-NEXT:    scvtf d1, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldp q4, q6, [sp, #208]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #240]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr q7, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #160]
+; NONEON-NOSVE-NEXT:    scvtf d1, w9
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ldr q5, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x1, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = sitofp <16 x i32> %op1 to <16 x double>
   store <16 x double> %res, ptr %b
@@ -1033,6 +2633,21 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x half>
   ret <2 x half> %res
 }
@@ -1053,6 +2668,29 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    scvtf s0, x9
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x half>
   ret <4 x half> %res
@@ -1071,6 +2709,18 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) {
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x float>
   ret <2 x float> %res
 }
@@ -1088,6 +2738,23 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    scvtf s1, x9
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x float>
   ret <4 x float> %res
@@ -1105,6 +2772,18 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) {
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    scvtf d1, x9
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sitofp <2 x i64> %op1 to <2 x double>
   ret <2 x double> %res
 }
@@ -1118,6 +2797,24 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    scvtf d1, x9
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    scvtf d1, x9
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sitofp <4 x i64> %op1 to <4 x double>
   store <4 x double> %res, ptr %b
@@ -1130,6 +2827,13 @@ define half @scvtf_i16_f16(ptr %0) {
 ; CHECK-NEXT:    ldrsh w8, [x0]
 ; CHECK-NEXT:    scvtf h0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i16_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrsh w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = sitofp i16 %2 to half
   ret half %3
@@ -1141,6 +2845,12 @@ define float @scvtf_i16_f32(ptr %0) {
 ; CHECK-NEXT:    ldrsh w8, [x0]
 ; CHECK-NEXT:    scvtf s0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i16_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrsh w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = sitofp i16 %2 to float
   ret float %3
@@ -1152,6 +2862,12 @@ define double @scvtf_i16_f64(ptr %0) {
 ; CHECK-NEXT:    ldrsh w8, [x0]
 ; CHECK-NEXT:    scvtf d0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i16_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrsh w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = sitofp i16 %2 to double
   ret double %3
@@ -1163,6 +2879,13 @@ define half @scvtf_i32_f16(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    scvtf h0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i32_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = sitofp i32 %2 to half
   ret half %3
@@ -1174,6 +2897,12 @@ define float @scvtf_i32_f32(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    scvtf s0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i32_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = sitofp i32 %2 to float
   ret float %3
@@ -1185,6 +2914,12 @@ define double @scvtf_i32_f64(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    scvtf d0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i32_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf d0, w8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = sitofp i32 %2 to double
   ret double %3
@@ -1196,6 +2931,13 @@ define half @scvtf_i64_f16(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    scvtf h0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i64_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = sitofp i64 %2 to half
   ret half %3
@@ -1207,6 +2949,12 @@ define float @scvtf_i64_f32(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    scvtf s0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i64_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf s0, x8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = sitofp i64 %2 to float
   ret float %3
@@ -1218,6 +2966,12 @@ define double @scvtf_i64_f64(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    scvtf d0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: scvtf_i64_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    scvtf d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = sitofp i64 %2 to double
   ret double %3
@@ -1229,6 +2983,13 @@ define half @ucvtf_i16_f16(ptr %0) {
 ; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ucvtf h0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i16_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to half
   ret half %3
@@ -1240,6 +3001,12 @@ define float @ucvtf_i16_f32(ptr %0) {
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ucvtf s0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i16_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to float
   ret float %3
@@ -1251,6 +3018,12 @@ define double @ucvtf_i16_f64(ptr %0) {
 ; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ucvtf d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i16_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr h0, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i16, ptr %0, align 64
   %3 = uitofp i16 %2 to double
   ret double %3
@@ -1262,6 +3035,13 @@ define half @ucvtf_i32_f16(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    ucvtf h0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i32_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = uitofp i32 %2 to half
   ret half %3
@@ -1273,6 +3053,12 @@ define float @ucvtf_i32_f32(ptr %0) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    ucvtf s0, w8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i32_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, w8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = uitofp i32 %2 to float
   ret float %3
@@ -1284,6 +3070,12 @@ define double @ucvtf_i32_f64(ptr %0) {
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ucvtf d0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i32_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf d0, d0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i32, ptr %0, align 64
   %3 = uitofp i32 %2 to double
   ret double %3
@@ -1295,6 +3087,13 @@ define half @ucvtf_i64_f16(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ucvtf h0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i64_f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = uitofp i64 %2 to half
   ret half %3
@@ -1306,6 +3105,12 @@ define float @ucvtf_i64_f32(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ucvtf s0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i64_f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf s0, x8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = uitofp i64 %2 to float
   ret float %3
@@ -1317,6 +3122,12 @@ define double @ucvtf_i64_f64(ptr %0) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ucvtf d0, x8
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ucvtf_i64_f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0]
+; NONEON-NOSVE-NEXT:    ucvtf d0, x8
+; NONEON-NOSVE-NEXT:    ret
   %2 = load i64, ptr %0, align 64
   %3 = uitofp i64 %2 to double
   ret double %3
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 3775a64a89a0c..af15d5f67ad15 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,6 +19,44 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, <4 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, ne
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w9, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2
   ret <4 x i8> %sel
 }
@@ -36,6 +75,72 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #7]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #21]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    tst w13, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w13, w17, w16, ne
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w15, #0xff
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #5]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    csel w13, w16, w13, ne
+; NONEON-NOSVE-NEXT:    tst w14, #0xff
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w13, w15, w13, ne
+; NONEON-NOSVE-NEXT:    tst w12, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w12, w14, w13, ne
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    tst w11, #0xff
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    csel w11, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w10, #0xff
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w10, w12, w11, ne
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #1]
+; NONEON-NOSVE-NEXT:    tst w9, #0xff
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #9]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xff
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2
   ret <8 x i8> %sel
 }
@@ -54,6 +159,128 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #47]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #45]
+; NONEON-NOSVE-NEXT:    sbfx w2, w2, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w4, w4, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #44]
+; NONEON-NOSVE-NEXT:    sbfx w3, w3, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #42]
+; NONEON-NOSVE-NEXT:    tst w2, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w1, w1, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #41]
+; NONEON-NOSVE-NEXT:    csel w2, w6, w5, ne
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #14]
+; NONEON-NOSVE-NEXT:    tst w4, #0xff
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w0, w0, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w18, w18, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w17, w17, #0, #1
+; NONEON-NOSVE-NEXT:    csel w2, w5, w2, ne
+; NONEON-NOSVE-NEXT:    tst w3, #0xff
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #37]
+; NONEON-NOSVE-NEXT:    csel w2, w4, w2, ne
+; NONEON-NOSVE-NEXT:    tst w1, #0xff
+; NONEON-NOSVE-NEXT:    sbfx w16, w16, #0, #1
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #28]
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #36]
+; NONEON-NOSVE-NEXT:    csel w1, w3, w2, ne
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #11]
+; NONEON-NOSVE-NEXT:    tst w0, #0xff
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #27]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    csel w0, w2, w1, ne
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #10]
+; NONEON-NOSVE-NEXT:    tst w18, #0xff
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #26]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    csel w18, w1, w0, ne
+; NONEON-NOSVE-NEXT:    ldrb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    tst w17, #0xff
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #25]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w17, w0, w18, ne
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #8]
+; NONEON-NOSVE-NEXT:    tst w16, #0xff
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w16, w18, w17, ne
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #7]
+; NONEON-NOSVE-NEXT:    tst w15, #0xff
+; NONEON-NOSVE-NEXT:    strb w16, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w15, w17, w16, ne
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w14, #0xff
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w14, w16, w15, ne
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #5]
+; NONEON-NOSVE-NEXT:    tst w13, #0xff
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #21]
+; NONEON-NOSVE-NEXT:    csel w13, w15, w14, ne
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w12, #0xff
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w12, w14, w13, ne
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    tst w11, #0xff
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #19]
+; NONEON-NOSVE-NEXT:    csel w11, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w10, #0xff
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w10, w12, w11, ne
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #1]
+; NONEON-NOSVE-NEXT:    tst w9, #0xff
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #17]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xff
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2
   ret <16 x i8> %sel
 }
@@ -70,6 +297,208 @@ define void @select_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.b, p0, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #208
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 208
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -8
+; NONEON-NOSVE-NEXT:    .cfi_offset w20, -16
+; NONEON-NOSVE-NEXT:    .cfi_offset w21, -24
+; NONEON-NOSVE-NEXT:    .cfi_offset w22, -32
+; NONEON-NOSVE-NEXT:    .cfi_offset w23, -40
+; NONEON-NOSVE-NEXT:    .cfi_offset w24, -48
+; NONEON-NOSVE-NEXT:    .cfi_offset w25, -56
+; NONEON-NOSVE-NEXT:    .cfi_offset w26, -64
+; NONEON-NOSVE-NEXT:    .cfi_offset w27, -72
+; NONEON-NOSVE-NEXT:    .cfi_offset w28, -80
+; NONEON-NOSVE-NEXT:    .cfi_offset w30, -88
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -96
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #19]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, eq
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #37]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #21]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    csel w8, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w16, w14
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #22]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #4] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    csel w8, w16, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w18
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #23]
+; NONEON-NOSVE-NEXT:    csel w12, w1, w18, eq
+; NONEON-NOSVE-NEXT:    cmp w2, w13
+; NONEON-NOSVE-NEXT:    ldrb w18, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w13, w2, w13, eq
+; NONEON-NOSVE-NEXT:    cmp w16, w14
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #25]
+; NONEON-NOSVE-NEXT:    csel w14, w16, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w18
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #26]
+; NONEON-NOSVE-NEXT:    csel w16, w1, w18, eq
+; NONEON-NOSVE-NEXT:    ldrb w1, [sp, #42]
+; NONEON-NOSVE-NEXT:    cmp w5, w2
+; NONEON-NOSVE-NEXT:    csel w18, w5, w2, eq
+; NONEON-NOSVE-NEXT:    ldrb w2, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w5, [sp, #27]
+; NONEON-NOSVE-NEXT:    cmp w6, w1
+; NONEON-NOSVE-NEXT:    ldrb w19, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #45]
+; NONEON-NOSVE-NEXT:    csel w1, w6, w1, eq
+; NONEON-NOSVE-NEXT:    ldrb w6, [sp, #44]
+; NONEON-NOSVE-NEXT:    cmp w5, w2
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #29]
+; NONEON-NOSVE-NEXT:    str w8, [sp] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    csel w2, w5, w2, eq
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    cmp w19, w6
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w21, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #31]
+; NONEON-NOSVE-NEXT:    csel w5, w19, w6, eq
+; NONEON-NOSVE-NEXT:    cmp w30, w29
+; NONEON-NOSVE-NEXT:    ldrb w22, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w6, w30, w29, eq
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #49]
+; NONEON-NOSVE-NEXT:    csel w19, w8, w9, eq
+; NONEON-NOSVE-NEXT:    cmp w10, w21
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    csel w21, w10, w21, eq
+; NONEON-NOSVE-NEXT:    cmp w11, w22
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #51]
+; NONEON-NOSVE-NEXT:    csel w22, w11, w22, eq
+; NONEON-NOSVE-NEXT:    cmp w29, w28
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    csel w11, w29, w28, eq
+; NONEON-NOSVE-NEXT:    cmp w8, w27
+; NONEON-NOSVE-NEXT:    ldrb w24, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #53]
+; NONEON-NOSVE-NEXT:    csel w8, w8, w27, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w26
+; NONEON-NOSVE-NEXT:    ldrb w23, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #54]
+; NONEON-NOSVE-NEXT:    csel w9, w9, w26, eq
+; NONEON-NOSVE-NEXT:    cmp w10, w25
+; NONEON-NOSVE-NEXT:    ldrb w20, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #55]
+; NONEON-NOSVE-NEXT:    csel w10, w10, w25, eq
+; NONEON-NOSVE-NEXT:    cmp w28, w24
+; NONEON-NOSVE-NEXT:    ldrb w7, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #56]
+; NONEON-NOSVE-NEXT:    csel w24, w28, w24, eq
+; NONEON-NOSVE-NEXT:    cmp w27, w23
+; NONEON-NOSVE-NEXT:    ldrb w4, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #57]
+; NONEON-NOSVE-NEXT:    csel w23, w27, w23, eq
+; NONEON-NOSVE-NEXT:    cmp w26, w20
+; NONEON-NOSVE-NEXT:    ldrb w3, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #58]
+; NONEON-NOSVE-NEXT:    csel w20, w26, w20, eq
+; NONEON-NOSVE-NEXT:    cmp w25, w7
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #59]
+; NONEON-NOSVE-NEXT:    csel w7, w25, w7, eq
+; NONEON-NOSVE-NEXT:    cmp w28, w4
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #60]
+; NONEON-NOSVE-NEXT:    csel w4, w28, w4, eq
+; NONEON-NOSVE-NEXT:    cmp w27, w3
+; NONEON-NOSVE-NEXT:    csel w3, w27, w3, eq
+; NONEON-NOSVE-NEXT:    cmp w26, w17
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w27, [sp, #61]
+; NONEON-NOSVE-NEXT:    csel w17, w26, w17, eq
+; NONEON-NOSVE-NEXT:    cmp w25, w15
+; NONEON-NOSVE-NEXT:    ldrb w26, [sp, #78]
+; NONEON-NOSVE-NEXT:    csel w15, w25, w15, eq
+; NONEON-NOSVE-NEXT:    ldrb w25, [sp, #62]
+; NONEON-NOSVE-NEXT:    cmp w27, w28
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w30, [sp, #63]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #99]
+; NONEON-NOSVE-NEXT:    csel w27, w27, w28, eq
+; NONEON-NOSVE-NEXT:    cmp w25, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    csel w25, w25, w26, eq
+; NONEON-NOSVE-NEXT:    cmp w30, w29
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    csel w26, w30, w29, eq
+; NONEON-NOSVE-NEXT:    ldrb w28, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w29, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w26, [sp, #111]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    cmp w29, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w25, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w27, [sp, #109]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    csel w8, w29, w28, eq
+; NONEON-NOSVE-NEXT:    strb w15, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w17, [sp, #107]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w3, [sp, #106]
+; NONEON-NOSVE-NEXT:    strb w4, [sp, #105]
+; NONEON-NOSVE-NEXT:    strb w7, [sp, #104]
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #103]
+; NONEON-NOSVE-NEXT:    strb w23, [sp, #102]
+; NONEON-NOSVE-NEXT:    strb w24, [sp, #101]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #100]
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #97]
+; NONEON-NOSVE-NEXT:    strb w22, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w21, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w19, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #93]
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #92]
+; NONEON-NOSVE-NEXT:    strb w2, [sp, #91]
+; NONEON-NOSVE-NEXT:    strb w1, [sp, #90]
+; NONEON-NOSVE-NEXT:    strb w18, [sp, #89]
+; NONEON-NOSVE-NEXT:    strb w16, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w14, [sp, #87]
+; NONEON-NOSVE-NEXT:    strb w13, [sp, #86]
+; NONEON-NOSVE-NEXT:    strb w12, [sp, #85]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #82]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #208
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %mask = icmp eq <32 x i8> %op1, %op2
@@ -92,6 +521,29 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, <2 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    csel w8, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w8, ne
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2
   ret <2 x i16> %sel
 }
@@ -110,6 +562,44 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, ne
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w9, ne
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2
   ret <4 x i16> %sel
 }
@@ -129,6 +619,72 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #47]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #45]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #44]
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    tst w13, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    csel w13, w17, w16, ne
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #12]
+; NONEON-NOSVE-NEXT:    tst w15, #0xffff
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    csel w13, w16, w13, ne
+; NONEON-NOSVE-NEXT:    tst w14, #0xffff
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    csel w13, w15, w13, ne
+; NONEON-NOSVE-NEXT:    tst w12, #0xffff
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w12, w14, w13, ne
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #6]
+; NONEON-NOSVE-NEXT:    tst w11, #0xffff
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #22]
+; NONEON-NOSVE-NEXT:    csel w11, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    tst w10, #0xffff
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w10, w12, w11, ne
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2]
+; NONEON-NOSVE-NEXT:    tst w9, #0xffff
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #18]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp]
+; NONEON-NOSVE-NEXT:    tst w8, #0xffff
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2
   ret <8 x i16> %sel
 }
@@ -145,6 +701,102 @@ define void @select_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #112
+; NONEON-NOSVE-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 112
+; NONEON-NOSVE-NEXT:    .cfi_offset w19, -16
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #12]
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w15, w14
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #10]
+; NONEON-NOSVE-NEXT:    csel w14, w15, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w17, w16
+; NONEON-NOSVE-NEXT:    csel w16, w17, w16, eq
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #28]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #14]
+; NONEON-NOSVE-NEXT:    csel w12, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w17
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #32]
+; NONEON-NOSVE-NEXT:    csel w17, w1, w17, eq
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp w4, w3
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #52]
+; NONEON-NOSVE-NEXT:    csel w3, w4, w3, eq
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #36]
+; NONEON-NOSVE-NEXT:    cmp w5, w1
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #38]
+; NONEON-NOSVE-NEXT:    csel w1, w5, w1, eq
+; NONEON-NOSVE-NEXT:    cmp w7, w6
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #40]
+; NONEON-NOSVE-NEXT:    csel w6, w7, w6, eq
+; NONEON-NOSVE-NEXT:    cmp w4, w2
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #42]
+; NONEON-NOSVE-NEXT:    csel w2, w4, w2, eq
+; NONEON-NOSVE-NEXT:    cmp w19, w13
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #44]
+; NONEON-NOSVE-NEXT:    csel w13, w19, w13, eq
+; NONEON-NOSVE-NEXT:    cmp w5, w18
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #46]
+; NONEON-NOSVE-NEXT:    csel w18, w5, w18, eq
+; NONEON-NOSVE-NEXT:    cmp w7, w15
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp]
+; NONEON-NOSVE-NEXT:    csel w15, w7, w15, eq
+; NONEON-NOSVE-NEXT:    cmp w4, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    csel w11, w4, w11, eq
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w19, w10
+; NONEON-NOSVE-NEXT:    csel w10, w19, w10, eq
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    cmp w5, w4
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #94]
+; NONEON-NOSVE-NEXT:    csel w8, w5, w4, eq
+; NONEON-NOSVE-NEXT:    strh w15, [sp, #90]
+; NONEON-NOSVE-NEXT:    strh w18, [sp, #88]
+; NONEON-NOSVE-NEXT:    strh w13, [sp, #86]
+; NONEON-NOSVE-NEXT:    strh w2, [sp, #84]
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w3, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w17, [sp, #76]
+; NONEON-NOSVE-NEXT:    strh w12, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w16, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w14, [sp, #70]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #68]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #112
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %mask = icmp eq <16 x i16> %op1, %op2
@@ -167,6 +819,29 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    csel w8, w11, w10, ne
+; NONEON-NOSVE-NEXT:    ldr w10, [sp]
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w8, ne
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2
   ret <2 x i32> %sel
 }
@@ -186,6 +861,44 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    cmp w9, #0
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    csel w9, w13, w12, ne
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w11, #0
+; NONEON-NOSVE-NEXT:    str w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    csel w9, w12, w9, ne
+; NONEON-NOSVE-NEXT:    cmp w10, #0
+; NONEON-NOSVE-NEXT:    ldr w10, [sp]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    csel w9, w11, w9, ne
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    str w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w10, w9, ne
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2
   ret <4 x i32> %sel
 }
@@ -202,6 +915,47 @@ define void @select_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldp w12, w11, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w10, w9
+; NONEON-NOSVE-NEXT:    csel w9, w10, w9, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w12
+; NONEON-NOSVE-NEXT:    ldp w15, w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    csel w12, w13, w12, eq
+; NONEON-NOSVE-NEXT:    cmp w14, w11
+; NONEON-NOSVE-NEXT:    ldp w10, w13, [sp, #32]
+; NONEON-NOSVE-NEXT:    csel w11, w14, w11, eq
+; NONEON-NOSVE-NEXT:    ldp w17, w14, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w18, w1, [sp, #40]
+; NONEON-NOSVE-NEXT:    cmp w10, w15
+; NONEON-NOSVE-NEXT:    stp w12, w11, [sp, #72]
+; NONEON-NOSVE-NEXT:    csel w10, w10, w15, eq
+; NONEON-NOSVE-NEXT:    cmp w13, w16
+; NONEON-NOSVE-NEXT:    ldr w15, [sp]
+; NONEON-NOSVE-NEXT:    csel w13, w13, w16, eq
+; NONEON-NOSVE-NEXT:    cmp w18, w17
+; NONEON-NOSVE-NEXT:    csel w16, w18, w17, eq
+; NONEON-NOSVE-NEXT:    cmp w1, w14
+; NONEON-NOSVE-NEXT:    stp w10, w13, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel w10, w1, w14, eq
+; NONEON-NOSVE-NEXT:    cmp w15, w8
+; NONEON-NOSVE-NEXT:    csel w8, w15, w8, eq
+; NONEON-NOSVE-NEXT:    stp w16, w10, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %mask = icmp eq <8 x i32> %op1, %op2
@@ -223,6 +977,19 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d1
+; NONEON-NOSVE-NEXT:    fmov x9, d0
+; NONEON-NOSVE-NEXT:    tst w0, #0x1
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, ne
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2
   ret <1 x i64> %sel
 }
@@ -242,6 +1009,29 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) {
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp, #8]
+; NONEON-NOSVE-NEXT:    sbfx x8, x8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx x9, x9, #0, #1
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    csel x8, x11, x10, ne
+; NONEON-NOSVE-NEXT:    ldr x10, [sp]
+; NONEON-NOSVE-NEXT:    cmp x9, #0
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel x8, x10, x8, ne
+; NONEON-NOSVE-NEXT:    str x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2
   ret <2 x i64> %sel
 }
@@ -258,6 +1048,34 @@ define void @select_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sel z1.d, p0, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: select_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q3, [x1]
+; NONEON-NOSVE-NEXT:    ldp q1, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-96]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x13, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp x10, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    cmp x9, x8
+; NONEON-NOSVE-NEXT:    csel x8, x9, x8, eq
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, eq
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr x11, [sp]
+; NONEON-NOSVE-NEXT:    cmp x13, x12
+; NONEON-NOSVE-NEXT:    csel x12, x13, x12, eq
+; NONEON-NOSVE-NEXT:    cmp x11, x10
+; NONEON-NOSVE-NEXT:    stp x9, x12, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel x9, x11, x10, eq
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %mask = icmp eq <4 x i64> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 0b6152340f65a..66d544d0acbf5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -33,19 +33,23 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    add x0, sp, #12
+; NONEON-NOSVE-NEXT:    add x0, sp, #28
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
-; NONEON-NOSVE-NEXT:    ushll v0.8h, v0.8b, #0
-; NONEON-NOSVE-NEXT:    umov w8, v0.h[2]
-; NONEON-NOSVE-NEXT:    umov w9, v0.h[0]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
 ; NONEON-NOSVE-NEXT:    strb w8, [x19, #1]
 ; NONEON-NOSVE-NEXT:    strb w9, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [4 x i8]
   call void @def(ptr %alloc)
@@ -88,21 +92,25 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v6i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    add x0, sp, #8
+; NONEON-NOSVE-NEXT:    add x0, sp, #24
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add x9, x19, #2
-; NONEON-NOSVE-NEXT:    rev16 v1.16b, v0.16b
-; NONEON-NOSVE-NEXT:    xtn v1.8b, v1.8h
-; NONEON-NOSVE-NEXT:    str s1, [sp, #4]
-; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
-; NONEON-NOSVE-NEXT:    st1 { v0.b }[5], [x9]
-; NONEON-NOSVE-NEXT:    strh w8, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [x19, #2]
+; NONEON-NOSVE-NEXT:    strh w9, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [6 x i8]
   call void @def(ptr %alloc)
@@ -135,18 +143,38 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v32i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #48
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #112
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #96] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    mov x0, sp
+; NONEON-NOSVE-NEXT:    add x0, sp, #64
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp]
-; NONEON-NOSVE-NEXT:    add x8, x19, #8
-; NONEON-NOSVE-NEXT:    xtn v0.8b, v0.8h
-; NONEON-NOSVE-NEXT:    st1 { v1.b }[0], [x8]
-; NONEON-NOSVE-NEXT:    str d0, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [x19, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [x19]
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #96] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #112
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [32 x i8]
   call void @def(ptr %alloc)
@@ -179,18 +207,26 @@ define void @alloc_v8f64(ptr %st_ptr) nounwind {
 ;
 ; NONEON-NOSVE-LABEL: alloc_v8f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #80
-; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #176
+; NONEON-NOSVE-NEXT:    stp x30, x19, [sp, #160] // 16-byte Folded Spill
 ; NONEON-NOSVE-NEXT:    mov x19, x0
-; NONEON-NOSVE-NEXT:    mov x0, sp
+; NONEON-NOSVE-NEXT:    add x0, sp, #96
 ; NONEON-NOSVE-NEXT:    bl def
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp]
-; NONEON-NOSVE-NEXT:    zip1 v0.2d, v1.2d, v0.2d
-; NONEON-NOSVE-NEXT:    zip1 v1.2d, v3.2d, v2.2d
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
 ; NONEON-NOSVE-NEXT:    stp q1, q0, [x19]
-; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp x30, x19, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #176
 ; NONEON-NOSVE-NEXT:    ret
   %alloc = alloca [8 x double]
   call void @def(ptr %alloc)
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
index 918f0ccc0cf6a..3b83f982b6bfc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible  < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible  < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -18,6 +19,72 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) {
 ; CHECK-NEXT:    stp q2, q5, [x0, #32]
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q4, q2, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
   %1 = load <16 x i32>, ptr %arg2, align 256
@@ -42,6 +109,75 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) {
 ; CHECK-NEXT:    stp q3, q4, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test2:
+; NONEON-NOSVE:       // %bb.0: // %entry
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 144
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp q4, q2, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
   %1 = load <16 x i32>, ptr %arg2, align 256
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
index 8c69d5b0bb375..c97a3c2e721a3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -11,6 +12,22 @@ define <4 x i8> @load_v4i8(ptr %a) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #1]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i8>, ptr %a
   ret <4 x i8> %load
 }
@@ -20,6 +37,11 @@ define <8 x i8> @load_v8i8(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x i8>, ptr %a
   ret <8 x i8> %load
 }
@@ -29,6 +51,11 @@ define <16 x i8> @load_v16i8(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <16 x i8>, ptr %a
   ret <16 x i8> %load
 }
@@ -38,6 +65,11 @@ define <32 x i8> @load_v32i8(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <32 x i8>, ptr %a
   ret <32 x i8> %load
 }
@@ -49,6 +81,18 @@ define <2 x i16> @load_v2i16(ptr %a) {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i16>, ptr %a
   ret <2 x i16> %load
 }
@@ -58,6 +102,16 @@ define <2 x half> @load_v2f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x half>, ptr %a
   ret <2 x half> %load
 }
@@ -67,6 +121,11 @@ define <4 x i16> @load_v4i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i16>, ptr %a
   ret <4 x i16> %load
 }
@@ -76,6 +135,11 @@ define <4 x half> @load_v4f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x half>, ptr %a
   ret <4 x half> %load
 }
@@ -85,6 +149,11 @@ define <8 x i16> @load_v8i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x i16>, ptr %a
   ret <8 x i16> %load
 }
@@ -94,6 +163,11 @@ define <8 x half> @load_v8f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x half>, ptr %a
   ret <8 x half> %load
 }
@@ -103,6 +177,11 @@ define <16 x i16> @load_v16i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <16 x i16>, ptr %a
   ret <16 x i16> %load
 }
@@ -112,6 +191,11 @@ define <16 x half> @load_v16f16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <16 x half>, ptr %a
   ret <16 x half> %load
 }
@@ -121,6 +205,11 @@ define <2 x i32> @load_v2i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i32>, ptr %a
   ret <2 x i32> %load
 }
@@ -130,6 +219,11 @@ define <2 x float> @load_v2f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x float>, ptr %a
   ret <2 x float> %load
 }
@@ -139,6 +233,11 @@ define <4 x i32> @load_v4i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i32>, ptr %a
   ret <4 x i32> %load
 }
@@ -148,6 +247,11 @@ define <4 x float> @load_v4f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x float>, ptr %a
   ret <4 x float> %load
 }
@@ -157,6 +261,11 @@ define <8 x i32> @load_v8i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x i32>, ptr %a
   ret <8 x i32> %load
 }
@@ -166,6 +275,11 @@ define <8 x float> @load_v8f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <8 x float>, ptr %a
   ret <8 x float> %load
 }
@@ -175,6 +289,11 @@ define <1 x i64> @load_v1i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <1 x i64>, ptr %a
   ret <1 x i64> %load
 }
@@ -184,6 +303,11 @@ define <1 x double> @load_v1f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <1 x double>, ptr %a
   ret <1 x double> %load
 }
@@ -193,6 +317,11 @@ define <2 x i64> @load_v2i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x i64>, ptr %a
   ret <2 x i64> %load
 }
@@ -202,6 +331,11 @@ define <2 x double> @load_v2f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <2 x double>, ptr %a
   ret <2 x double> %load
 }
@@ -211,6 +345,11 @@ define <4 x i64> @load_v4i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x i64>, ptr %a
   ret <4 x i64> %load
 }
@@ -220,6 +359,11 @@ define <4 x double> @load_v4f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: load_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %load = load <4 x double>, ptr %a
   ret <4 x double> %load
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
index ef52eadc5d3b0..9e1edb817c459 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -17,6 +18,21 @@ define i8 @andv_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a)
   ret i8 %res
 }
@@ -29,6 +45,29 @@ define i8 @andv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w10, w12, w10
+; NONEON-NOSVE-NEXT:    and w8, w8, w14
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -41,6 +80,44 @@ define i8 @andv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w11, w14, w13
+; NONEON-NOSVE-NEXT:    and w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    and w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w12, w12, w14
+; NONEON-NOSVE-NEXT:    and w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w9, w10, w9
+; NONEON-NOSVE-NEXT:    and w10, w12, w16
+; NONEON-NOSVE-NEXT:    and w8, w8, w15
+; NONEON-NOSVE-NEXT:    and w9, w9, w10
+; NONEON-NOSVE-NEXT:    and w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -54,6 +131,77 @@ define i8 @andv_v32i8(ptr %a) {
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    and w9, w11, w10
+; NONEON-NOSVE-NEXT:    and w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    and w11, w15, w14
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    and w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    and w14, w15, w14
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w9, w9, w14
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w10, w10, w12
+; NONEON-NOSVE-NEXT:    and w11, w16, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, w11
+; NONEON-NOSVE-NEXT:    and w11, w17, w13
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -67,6 +215,16 @@ define i16 @andv_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a)
   ret i16 %res
 }
@@ -79,6 +237,21 @@ define i16 @andv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -91,6 +264,28 @@ define i16 @andv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w10, w12, w10
+; NONEON-NOSVE-NEXT:    and w8, w8, w14
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -104,6 +299,45 @@ define i16 @andv_v16i16(ptr %a) {
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w9, w11, w10
+; NONEON-NOSVE-NEXT:    and w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w13, w15, w14
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w9, w12, w13
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    and w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    and w10, w14, w10
+; NONEON-NOSVE-NEXT:    and w11, w15, w11
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w8, w8, w9
+; NONEON-NOSVE-NEXT:    and w9, w13, w12
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -117,6 +351,16 @@ define i32 @andv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -129,6 +373,17 @@ define i32 @andv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w0, w10, w8
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -142,6 +397,25 @@ define i32 @andv_v8i32(ptr %a) {
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w10, w8
+; NONEON-NOSVE-NEXT:    and w9, w11, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, w8
+; NONEON-NOSVE-NEXT:    and w10, w14, w12
+; NONEON-NOSVE-NEXT:    and w11, w15, w13
+; NONEON-NOSVE-NEXT:    and w9, w10, w11
+; NONEON-NOSVE-NEXT:    and w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -155,6 +429,14 @@ define i64 @andv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    andv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    and x0, x9, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -168,6 +450,18 @@ define i64 @andv_v4i64(ptr %a) {
 ; CHECK-NEXT:    andv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: andv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    and x8, x10, x8
+; NONEON-NOSVE-NEXT:    and x9, x11, x9
+; NONEON-NOSVE-NEXT:    and x0, x9, x8
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -185,6 +479,21 @@ define i8 @eorv_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a)
   ret i8 %res
 }
@@ -197,6 +506,29 @@ define i8 @eorv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w10, w12, w10
+; NONEON-NOSVE-NEXT:    eor w8, w8, w14
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -209,6 +541,44 @@ define i8 @eorv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w11, w14, w13
+; NONEON-NOSVE-NEXT:    eor w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    eor w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    eor w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    eor w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w12, w12, w14
+; NONEON-NOSVE-NEXT:    eor w8, w8, w11
+; NONEON-NOSVE-NEXT:    eor w9, w10, w9
+; NONEON-NOSVE-NEXT:    eor w10, w12, w16
+; NONEON-NOSVE-NEXT:    eor w8, w8, w15
+; NONEON-NOSVE-NEXT:    eor w9, w9, w10
+; NONEON-NOSVE-NEXT:    eor w8, w8, w13
+; NONEON-NOSVE-NEXT:    eor w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -222,6 +592,77 @@ define i8 @eorv_v32i8(ptr %a) {
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    eor w9, w11, w10
+; NONEON-NOSVE-NEXT:    eor w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    eor w11, w15, w14
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    eor w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    eor w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    eor w14, w15, w14
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    eor w9, w9, w14
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w10, w10, w12
+; NONEON-NOSVE-NEXT:    eor w11, w16, w11
+; NONEON-NOSVE-NEXT:    eor w10, w10, w11
+; NONEON-NOSVE-NEXT:    eor w11, w17, w13
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -235,6 +676,16 @@ define i16 @eorv_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a)
   ret i16 %res
 }
@@ -247,6 +698,21 @@ define i16 @eorv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -259,6 +725,28 @@ define i16 @eorv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w10, w12, w10
+; NONEON-NOSVE-NEXT:    eor w8, w8, w14
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -272,6 +760,45 @@ define i16 @eorv_v16i16(ptr %a) {
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    eor w9, w11, w10
+; NONEON-NOSVE-NEXT:    eor w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    eor w13, w15, w14
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w9, w12, w13
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    eor w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    eor w10, w14, w10
+; NONEON-NOSVE-NEXT:    eor w11, w15, w11
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w8, w8, w9
+; NONEON-NOSVE-NEXT:    eor w9, w13, w12
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -285,6 +812,16 @@ define i32 @eorv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -297,6 +834,17 @@ define i32 @eorv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    eor w10, w11, w10
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w0, w10, w8
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -310,6 +858,25 @@ define i32 @eorv_v8i32(ptr %a) {
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    eor w8, w10, w8
+; NONEON-NOSVE-NEXT:    eor w9, w11, w9
+; NONEON-NOSVE-NEXT:    eor w8, w9, w8
+; NONEON-NOSVE-NEXT:    eor w10, w14, w12
+; NONEON-NOSVE-NEXT:    eor w11, w15, w13
+; NONEON-NOSVE-NEXT:    eor w9, w10, w11
+; NONEON-NOSVE-NEXT:    eor w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -323,6 +890,14 @@ define i64 @eorv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    eorv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    eor x0, x9, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -336,6 +911,18 @@ define i64 @eorv_v4i64(ptr %a) {
 ; CHECK-NEXT:    eorv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: eorv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    eor x8, x10, x8
+; NONEON-NOSVE-NEXT:    eor x9, x11, x9
+; NONEON-NOSVE-NEXT:    eor x0, x9, x8
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
   ret i64 %res
@@ -353,6 +940,21 @@ define i8 @orv_v4i8(<4 x i8> %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a)
   ret i8 %res
 }
@@ -365,6 +967,29 @@ define i8 @orv_v8i8(<8 x i8> %a) {
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w10, w12, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w14
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
   ret i8 %res
 }
@@ -377,6 +1002,44 @@ define i8 @orv_v16i8(<16 x i8> %a) {
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w11, w14, w13
+; NONEON-NOSVE-NEXT:    orr w9, w12, w9
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w10, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w16
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w12, w12, w15
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w8, w13, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w12, w12, w14
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    orr w9, w10, w9
+; NONEON-NOSVE-NEXT:    orr w10, w12, w16
+; NONEON-NOSVE-NEXT:    orr w8, w8, w15
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    orr w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
   ret i8 %res
 }
@@ -390,6 +1053,77 @@ define i8 @orv_v32i8(ptr %a) {
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #2]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #3]
+; NONEON-NOSVE-NEXT:    orr w9, w11, w10
+; NONEON-NOSVE-NEXT:    orr w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    orr w11, w15, w14
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w10, w14, w10
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #7]
+; NONEON-NOSVE-NEXT:    orr w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #24]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w10, w14, w13
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #9]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w11, w15, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #26]
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w10, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w16, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w11, w12, w11
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w17, [sp, #15]
+; NONEON-NOSVE-NEXT:    orr w10, w13, w10
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #13]
+; NONEON-NOSVE-NEXT:    orr w14, w15, w14
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #30]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w14
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #31]
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w10, w10, w12
+; NONEON-NOSVE-NEXT:    orr w11, w16, w11
+; NONEON-NOSVE-NEXT:    orr w10, w10, w11
+; NONEON-NOSVE-NEXT:    orr w11, w17, w13
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
   ret i8 %res
@@ -403,6 +1137,16 @@ define i16 @orv_v2i16(<2 x i16> %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a)
   ret i16 %res
 }
@@ -415,6 +1159,21 @@ define i16 @orv_v4i16(<4 x i16> %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w0, w10, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
   ret i16 %res
 }
@@ -427,6 +1186,28 @@ define i16 @orv_v8i16(<8 x i16> %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w10, w12, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w14
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
   ret i16 %res
 }
@@ -440,6 +1221,45 @@ define i16 @orv_v16i16(ptr %a) {
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #4]
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #6]
+; NONEON-NOSVE-NEXT:    orr w9, w11, w10
+; NONEON-NOSVE-NEXT:    orr w12, w13, w12
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    orr w13, w15, w14
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w9, w12, w13
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #12]
+; NONEON-NOSVE-NEXT:    orr w14, w17, w16
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #14]
+; NONEON-NOSVE-NEXT:    orr w10, w14, w10
+; NONEON-NOSVE-NEXT:    orr w11, w15, w11
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    orr w9, w13, w12
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
   ret i16 %res
@@ -453,6 +1273,16 @@ define i32 @orv_v2i32(<2 x i32> %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w0, w9, w8
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
   ret i32 %res
 }
@@ -465,6 +1295,17 @@ define i32 @orv_v4i32(<4 x i32> %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp], #16
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w0, w10, w8
+; NONEON-NOSVE-NEXT:    ret
   %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
   ret i32 %res
 }
@@ -478,6 +1319,25 @@ define i32 @orv_v8i32(ptr %a) {
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w11, w10, [sp]
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #8]
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w9, w11, w9
+; NONEON-NOSVE-NEXT:    orr w8, w9, w8
+; NONEON-NOSVE-NEXT:    orr w10, w14, w12
+; NONEON-NOSVE-NEXT:    orr w11, w15, w13
+; NONEON-NOSVE-NEXT:    orr w9, w10, w11
+; NONEON-NOSVE-NEXT:    orr w0, w8, w9
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
   ret i32 %res
@@ -491,6 +1351,14 @@ define i64 @orv_v2i64(<2 x i64> %a) {
 ; CHECK-NEXT:    orv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp], #16
+; NONEON-NOSVE-NEXT:    orr x0, x9, x8
+; NONEON-NOSVE-NEXT:    ret
   %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
   ret i64 %res
 }
@@ -504,6 +1372,18 @@ define i64 @orv_v4i64(ptr %a) {
 ; CHECK-NEXT:    orv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: orv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp], #32
+; NONEON-NOSVE-NEXT:    orr x8, x10, x8
+; NONEON-NOSVE-NEXT:    orr x9, x11, x9
+; NONEON-NOSVE-NEXT:    orr x0, x9, x8
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
   ret i64 %res
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 4f8f8c2e4b244..be335c697707d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -19,6 +20,87 @@ define <4 x i8> @masked_load_v4i8(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    str d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #112]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB0_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0]
+; NONEON-NOSVE-NEXT:    strh wzr, [sp, #110]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB0_3
+; NONEON-NOSVE-NEXT:    b .LBB0_4
+; NONEON-NOSVE-NEXT:  .LBB0_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI0_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI0_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB0_4
+; NONEON-NOSVE-NEXT:  .LBB0_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:  .LBB0_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB0_7
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB0_8
+; NONEON-NOSVE-NEXT:  .LBB0_6: // %else8
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB0_7: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB0_6
+; NONEON-NOSVE-NEXT:  .LBB0_8: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x i8> @llvm.masked.load.v4i8(ptr %src, i32 8, <4 x i1> %mask, <4 x i8> zeroinitializer)
   ret <4 x i8> %load
 }
@@ -34,6 +116,186 @@ define <8 x i8> @masked_load_v8i8(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 272
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #240]
+; NONEON-NOSVE-NEXT:    add x9, sp, #176
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #242]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #243]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #241]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #244]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #245]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #246]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #240]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #247]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w11, w8
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x80
+; NONEON-NOSVE-NEXT:    add w10, w8, w10
+; NONEON-NOSVE-NEXT:    and w8, w10, #0xff
+; NONEON-NOSVE-NEXT:    tbz w10, #0, .LBB1_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #239]
+; NONEON-NOSVE-NEXT:    sturh wzr, [x9, #61]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #57]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #232]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB1_3
+; NONEON-NOSVE-NEXT:    b .LBB1_4
+; NONEON-NOSVE-NEXT:  .LBB1_2:
+; NONEON-NOSVE-NEXT:    adrp x10, .LCPI1_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x10, :lo12:.LCPI1_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB1_4
+; NONEON-NOSVE-NEXT:  .LBB1_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #1]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #224]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #222]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #34]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #42]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #200]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #217]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #216]
+; NONEON-NOSVE-NEXT:  .LBB1_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB1_12
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB1_13
+; NONEON-NOSVE-NEXT:  .LBB1_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB1_14
+; NONEON-NOSVE-NEXT:  .LBB1_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB1_15
+; NONEON-NOSVE-NEXT:  .LBB1_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB1_16
+; NONEON-NOSVE-NEXT:  .LBB1_9: // %else17
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_11
+; NONEON-NOSVE-NEXT:  .LBB1_10: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #7]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:  .LBB1_11: // %else20
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #256] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB1_12: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #183]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #191]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #3]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #11]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #184]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB1_6
+; NONEON-NOSVE-NEXT:  .LBB1_13: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #3]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #146]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #155]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #152]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB1_7
+; NONEON-NOSVE-NEXT:  .LBB1_14: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #119]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #127]
+; NONEON-NOSVE-NEXT:    ldurh w9, [sp, #117]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    sturh w9, [sp, #125]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #124]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #120]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB1_8
+; NONEON-NOSVE-NEXT:  .LBB1_15: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #5]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB1_9
+; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #6]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_10
+; NONEON-NOSVE-NEXT:    b .LBB1_11
   %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %src, i32 8, <8 x i1> %mask, <8 x i8> zeroinitializer)
   ret <8 x i8> %load
 }
@@ -49,6 +311,416 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #1024
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #976]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #984]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1000]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #976]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #992]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #991]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1007]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #990]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1006]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #989]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1005]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #988]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1004]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #987]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1003]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #986]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1002]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #985]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1001]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #983]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #999]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #982]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #998]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #981]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #997]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #980]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #996]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #979]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #995]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #994]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #977]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #993]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #992]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #1010]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1012]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1014]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #1016]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #1018]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #1020]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w12, w13
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w14
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1022]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB2_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #975]
+; NONEON-NOSVE-NEXT:    sturh wzr, [x9, #253]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #249]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #241]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #960]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #960]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_3
+; NONEON-NOSVE-NEXT:    b .LBB2_4
+; NONEON-NOSVE-NEXT:  .LBB2_2:
+; NONEON-NOSVE-NEXT:    adrp x10, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x10, :lo12:.LCPI2_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB2_4
+; NONEON-NOSVE-NEXT:  .LBB2_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #944]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #926]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #944]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #942]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #896]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #194]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #210]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #928]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #896]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #929]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #928]
+; NONEON-NOSVE-NEXT:  .LBB2_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB2_20
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB2_21
+; NONEON-NOSVE-NEXT:  .LBB2_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB2_22
+; NONEON-NOSVE-NEXT:  .LBB2_7: // %else11
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB2_23
+; NONEON-NOSVE-NEXT:  .LBB2_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB2_24
+; NONEON-NOSVE-NEXT:  .LBB2_9: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB2_25
+; NONEON-NOSVE-NEXT:  .LBB2_10: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB2_26
+; NONEON-NOSVE-NEXT:  .LBB2_11: // %else23
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB2_27
+; NONEON-NOSVE-NEXT:  .LBB2_12: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB2_28
+; NONEON-NOSVE-NEXT:  .LBB2_13: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB2_29
+; NONEON-NOSVE-NEXT:  .LBB2_14: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB2_30
+; NONEON-NOSVE-NEXT:  .LBB2_15: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB2_31
+; NONEON-NOSVE-NEXT:  .LBB2_16: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB2_32
+; NONEON-NOSVE-NEXT:  .LBB2_17: // %else41
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_19
+; NONEON-NOSVE-NEXT:  .LBB2_18: // %cond.load43
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #15]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB2_19: // %else44
+; NONEON-NOSVE-NEXT:    add sp, sp, #1024
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #848]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #880]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #863]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #880]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #879]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #832]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #131]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #147]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #848]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #864]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #832]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #866]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #864]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB2_6
+; NONEON-NOSVE-NEXT:  .LBB2_21: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #3]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #784]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #816]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #796]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #816]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #812]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #68]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #84]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #786]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #802]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #784]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #800]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #768]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #803]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #800]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB2_7
+; NONEON-NOSVE-NEXT:  .LBB2_22: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #720]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #752]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #735]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #752]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #751]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #5]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #21]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #720]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #704]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #740]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #736]
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB2_8
+; NONEON-NOSVE-NEXT:  .LBB2_23: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #5]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #656]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #670]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #688]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #686]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #198]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #214]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #660]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #676]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #656]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #640]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #677]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #672]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB2_9
+; NONEON-NOSVE-NEXT:  .LBB2_24: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #592]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #607]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #624]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #623]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #135]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #151]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #596]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #612]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #592]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #576]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #614]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #608]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB2_10
+; NONEON-NOSVE-NEXT:  .LBB2_25: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #7]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #528]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #560]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #534]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #532]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #548]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #528]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #551]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #544]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB2_11
+; NONEON-NOSVE-NEXT:  .LBB2_26: // %cond.load22
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #464]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #479]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #496]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #495]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x9, #9]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur w11, [x9, #25]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #464]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #448]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #488]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #480]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB2_12
+; NONEON-NOSVE-NEXT:  .LBB2_27: // %cond.load25
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #9]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #414]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #430]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #408]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #400]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #384]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #425]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #416]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB2_13
+; NONEON-NOSVE-NEXT:  .LBB2_28: // %cond.load28
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #351]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #367]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #344]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #336]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #320]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #362]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB2_14
+; NONEON-NOSVE-NEXT:  .LBB2_29: // %cond.load31
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #11]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #282]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #280]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #299]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB2_15
+; NONEON-NOSVE-NEXT:  .LBB2_30: // %cond.load34
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #239]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB2_16
+; NONEON-NOSVE-NEXT:  .LBB2_31: // %cond.load37
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #13]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #173]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB2_17
+; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.load40
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_18
+; NONEON-NOSVE-NEXT:    b .LBB2_19
   %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer)
   ret <16 x i8> %load
 }
@@ -130,6 +802,818 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #2064
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 2080
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2216]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2152]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2272]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2176]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2160]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2024]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2264]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2016]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2031]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2248]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2030]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2029]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2232]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2028]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2224]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2027]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2026]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2200]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2025]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2192]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2023]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2184]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #2022]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2168]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x20
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #2021]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2020]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2019]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2088]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2018]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2136]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2017]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2008]
+; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #2104]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2000]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #2080]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #2016]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2015]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #2120]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2014]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #2112]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #2013]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #2096]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x10
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #2012]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2011]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2010]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w7, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2009]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w10, w6, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2007]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w5, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2006]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w4, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2005]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w3, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2004]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w2, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2003]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2002]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2001]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #2048]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #2000]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2050]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2048]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2052]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2054]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2056]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2058]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2060]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w12, w13
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    str q0, [sp, #2032]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2034]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2032]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2036]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #2038]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #2040]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #2042]
+; NONEON-NOSVE-NEXT:    add w10, w12, w11
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #2044]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #2062]
+; NONEON-NOSVE-NEXT:    add w13, w13, w14
+; NONEON-NOSVE-NEXT:    add w14, w15, w16
+; NONEON-NOSVE-NEXT:    add w10, w10, w13
+; NONEON-NOSVE-NEXT:    add w11, w14, w11
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #2046]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w8, w12
+; NONEON-NOSVE-NEXT:    add w8, w9, w13
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI3_0
+; NONEON-NOSVE-NEXT:    bfi w8, w10, #16, #16
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI3_0]
+; NONEON-NOSVE-NEXT:    add x9, sp, #1744
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB3_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strb wzr, [sp, #1999]
+; NONEON-NOSVE-NEXT:    sturh wzr, [x9, #253]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #249]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #241]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1984]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1984]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_3
+; NONEON-NOSVE-NEXT:    b .LBB3_4
+; NONEON-NOSVE-NEXT:  .LBB3_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_4
+; NONEON-NOSVE-NEXT:  .LBB3_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1936]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1968]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1950]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1968]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1966]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1920]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #194]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #210]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1936]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1952]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1920]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1953]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1952]
+; NONEON-NOSVE-NEXT:  .LBB3_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_36
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_37
+; NONEON-NOSVE-NEXT:  .LBB3_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_38
+; NONEON-NOSVE-NEXT:  .LBB3_7: // %else11
+; NONEON-NOSVE-NEXT:    add x9, sp, #1488
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_39
+; NONEON-NOSVE-NEXT:  .LBB3_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_40
+; NONEON-NOSVE-NEXT:  .LBB3_9: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_41
+; NONEON-NOSVE-NEXT:  .LBB3_10: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_42
+; NONEON-NOSVE-NEXT:  .LBB3_11: // %else23
+; NONEON-NOSVE-NEXT:    add x9, sp, #1232
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_43
+; NONEON-NOSVE-NEXT:  .LBB3_12: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_44
+; NONEON-NOSVE-NEXT:  .LBB3_13: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_45
+; NONEON-NOSVE-NEXT:  .LBB3_14: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_46
+; NONEON-NOSVE-NEXT:  .LBB3_15: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_47
+; NONEON-NOSVE-NEXT:  .LBB3_16: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_48
+; NONEON-NOSVE-NEXT:  .LBB3_17: // %else41
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_49
+; NONEON-NOSVE-NEXT:  .LBB3_18: // %else44
+; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_50
+; NONEON-NOSVE-NEXT:  .LBB3_19: // %else47
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_51
+; NONEON-NOSVE-NEXT:  .LBB3_20: // %else50
+; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_52
+; NONEON-NOSVE-NEXT:  .LBB3_21: // %else53
+; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_53
+; NONEON-NOSVE-NEXT:  .LBB3_22: // %else56
+; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_54
+; NONEON-NOSVE-NEXT:  .LBB3_23: // %else59
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_55
+; NONEON-NOSVE-NEXT:  .LBB3_24: // %else62
+; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_56
+; NONEON-NOSVE-NEXT:  .LBB3_25: // %else65
+; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_57
+; NONEON-NOSVE-NEXT:  .LBB3_26: // %else68
+; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_58
+; NONEON-NOSVE-NEXT:  .LBB3_27: // %else71
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_59
+; NONEON-NOSVE-NEXT:  .LBB3_28: // %else74
+; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_60
+; NONEON-NOSVE-NEXT:  .LBB3_29: // %else77
+; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_61
+; NONEON-NOSVE-NEXT:  .LBB3_30: // %else80
+; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_62
+; NONEON-NOSVE-NEXT:  .LBB3_31: // %else83
+; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_63
+; NONEON-NOSVE-NEXT:  .LBB3_32: // %else86
+; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_64
+; NONEON-NOSVE-NEXT:  .LBB3_33: // %else89
+; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_35
+; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.load91
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #31]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB3_35: // %else92
+; NONEON-NOSVE-NEXT:    add sp, sp, #2064
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1872]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1904]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1887]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1904]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1903]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1856]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #131]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #147]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1872]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1888]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1856]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1890]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1888]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_6
+; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #3]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1808]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1840]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1820]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1840]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1836]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1792]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #68]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #84]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1810]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1826]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1808]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1824]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1792]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1827]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1824]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_7
+; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1744]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1776]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1759]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1776]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1775]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1728]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #5]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #21]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1744]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1760]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1728]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1764]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1760]
+; NONEON-NOSVE-NEXT:    add x9, sp, #1488
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_8
+; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #5]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1680]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1712]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1694]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1712]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1710]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1664]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #198]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #214]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1684]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1700]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1680]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1696]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1664]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1701]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1696]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_9
+; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1616]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1648]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1631]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1648]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1647]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1600]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #135]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #151]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1620]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1636]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1616]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1632]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1600]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1638]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1632]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_10
+; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #7]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1552]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1584]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1560]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1584]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1576]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1558]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1536]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1574]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1556]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1572]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1552]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1568]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1536]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1575]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1568]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_11
+; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.load22
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1488]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1520]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1503]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1520]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1519]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1472]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x9, #9]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur w11, [x9, #25]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1488]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1504]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1472]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1512]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1504]
+; NONEON-NOSVE-NEXT:    add x9, sp, #1232
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_12
+; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.load25
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #9]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1424]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1456]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1438]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1456]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1454]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1408]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1432]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1448]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1424]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1440]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1408]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1449]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1440]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_13
+; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.load28
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1360]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1392]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1375]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1392]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1391]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1344]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1368]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1384]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1360]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1376]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1344]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1386]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1376]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_14
+; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.load31
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #11]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1296]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1328]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #1308]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1328]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #1324]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1306]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1280]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1322]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1304]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #1320]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #1296]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #1312]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1280]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1323]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1312]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_15
+; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.load34
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1232]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1264]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1247]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1264]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #1263]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1216]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1240]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1256]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1232]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1248]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1216]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1260]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1248]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_16
+; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.load37
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #13]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1168]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1200]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1182]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1200]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #1198]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1180]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1152]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1196]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1176]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1192]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1168]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1184]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1152]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1197]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1184]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_17
+; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.load40
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1104]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1136]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1119]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1135]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1116]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1088]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #1132]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1128]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1104]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1120]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1088]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1134]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1120]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_18
+; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.load43
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #15]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1024]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1072]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1038]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1072]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1070]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #1036]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1040]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #1068]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #1032]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #1064]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #1024]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #1056]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1040]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1071]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #1056]
+; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_19
+; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.load46
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #976]
+; NONEON-NOSVE-NEXT:    add x10, sp, #976
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #991]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #1008]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #1007]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #960]
+; NONEON-NOSVE-NEXT:    ldurh w9, [x10, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x10, #9]
+; NONEON-NOSVE-NEXT:    sturh w9, [x10, #29]
+; NONEON-NOSVE-NEXT:    ldur x9, [x10, #1]
+; NONEON-NOSVE-NEXT:    stur w11, [x10, #25]
+; NONEON-NOSVE-NEXT:    stur x9, [x10, #17]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #960]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #992]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #992]
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_20
+; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.load49
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #17]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #944]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #926]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #944]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #942]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #896]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #194]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #210]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #912]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #928]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #896]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #929]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #928]
+; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_21
+; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.load52
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #18]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #848]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #880]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #863]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #880]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #879]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #832]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #131]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #147]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #848]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #864]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #832]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #866]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #864]
+; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_22
+; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.load55
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #19]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #784]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #816]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #796]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #816]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #812]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #68]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #84]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #786]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #802]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #784]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #800]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #768]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #803]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #800]
+; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_23
+; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.load58
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #20]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #720]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #752]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #735]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #752]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #751]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #5]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #21]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #720]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #704]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #740]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #736]
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_24
+; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.load61
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #21]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #656]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #670]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #688]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #686]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #198]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #214]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #660]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #676]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #656]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #640]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #677]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #672]
+; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_25
+; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.load64
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #22]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #592]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #607]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #624]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #623]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #135]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #151]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #596]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #612]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #592]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #576]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #614]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #608]
+; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_26
+; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.load67
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #23]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #528]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #536]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #560]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #534]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #532]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #548]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #528]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #544]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #512]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #551]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #544]
+; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_27
+; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.load70
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #464]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #479]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #496]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #495]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    ldur w11, [x9, #9]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    stur w11, [x9, #25]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #464]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #448]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #488]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #480]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_28
+; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.load73
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #25]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #400]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #414]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #432]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #430]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #202]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #218]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #408]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #400]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #384]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #425]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #416]
+; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_29
+; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.load76
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #26]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #336]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #351]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #368]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #367]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #139]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #155]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #344]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #336]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #320]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #362]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_30
+; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.load79
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #27]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #282]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #280]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #299]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_31
+; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.load82
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #28]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #223]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #239]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldurh w10, [x9, #13]
+; NONEON-NOSVE-NEXT:    sturh w10, [x9, #29]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_32
+; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.load85
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #29]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #176]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #173]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_33
+; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.load88
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #30]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #111]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_34
+; NONEON-NOSVE-NEXT:    b .LBB3_35
   %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %src, i32 8, <32 x i1> %mask, <32 x i8> zeroinitializer)
   ret <32 x i8> %load
 }
@@ -155,6 +1639,40 @@ define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB4_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h1, [x0]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #24]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_3
+; NONEON-NOSVE-NEXT:    b .LBB4_4
+; NONEON-NOSVE-NEXT:  .LBB4_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI4_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI4_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB4_4
+; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:  .LBB4_4: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x half> @llvm.masked.load.v2f16(ptr %src, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
   ret <2 x half> %load
 }
@@ -170,6 +1688,88 @@ define <4 x half> @masked_load_v4f16(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    str d0, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #116]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #118]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #112]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB5_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h1, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #106]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #104]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #104]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_3
+; NONEON-NOSVE-NEXT:    b .LBB5_4
+; NONEON-NOSVE-NEXT:  .LBB5_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI5_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI5_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB5_4
+; NONEON-NOSVE-NEXT:  .LBB5_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #96]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #88]
+; NONEON-NOSVE-NEXT:  .LBB5_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB5_7
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB5_8
+; NONEON-NOSVE-NEXT:  .LBB5_6: // %else8
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB5_7: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB5_6
+; NONEON-NOSVE-NEXT:  .LBB5_8: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #6]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x half> @llvm.masked.load.v4f16(ptr %src, i32 8, <4 x i1> %mask, <4 x half> zeroinitializer)
   ret <4 x half> %load
 }
@@ -186,6 +1786,187 @@ define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #496
+; NONEON-NOSVE-NEXT:    str x29, [sp, #480] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 496
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #464]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #467]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #465]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #469]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #470]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #464]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #471]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w11, w8
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
+; NONEON-NOSVE-NEXT:    orr w8, w8, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x80
+; NONEON-NOSVE-NEXT:    add w10, w8, w10
+; NONEON-NOSVE-NEXT:    and w8, w10, #0xff
+; NONEON-NOSVE-NEXT:    tbz w10, #0, .LBB6_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h1, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #250]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #242]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #448]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #462]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_3
+; NONEON-NOSVE-NEXT:    b .LBB6_4
+; NONEON-NOSVE-NEXT:  .LBB6_2:
+; NONEON-NOSVE-NEXT:    adrp x10, .LCPI6_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x10, :lo12:.LCPI6_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB6_4
+; NONEON-NOSVE-NEXT:  .LBB6_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #412]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #428]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #196]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #212]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #416]
+; NONEON-NOSVE-NEXT:  .LBB6_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB6_12
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB6_13
+; NONEON-NOSVE-NEXT:  .LBB6_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB6_14
+; NONEON-NOSVE-NEXT:  .LBB6_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB6_15
+; NONEON-NOSVE-NEXT:  .LBB6_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB6_16
+; NONEON-NOSVE-NEXT:  .LBB6_9: // %else17
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB6_11
+; NONEON-NOSVE-NEXT:  .LBB6_10: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB6_11: // %else20
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #480] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #496
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB6_12: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #350]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #366]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #134]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #150]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #352]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB6_6
+; NONEON-NOSVE-NEXT:  .LBB6_13: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #280]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #276]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #288]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #256]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB6_7
+; NONEON-NOSVE-NEXT:  .LBB6_14: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #208]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #240]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #222]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #192]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB6_8
+; NONEON-NOSVE-NEXT:  .LBB6_15: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #152]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB6_9
+; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldr h1, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_10
+; NONEON-NOSVE-NEXT:    b .LBB6_11
   %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer)
   ret <8 x half> %load
 }
@@ -210,6 +1991,386 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #1024
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 1040
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str q0, [sp, #976]
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI7_0
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #984]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1000]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #976]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #992]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #991]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1007]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #990]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1006]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #989]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1005]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #988]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1004]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #987]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1003]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #986]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1002]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #985]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1001]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #983]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #999]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #982]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #998]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #981]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #997]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #980]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #996]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #979]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #995]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #978]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #994]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #977]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #993]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #992]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT:    add x9, sp, #720
+; NONEON-NOSVE-NEXT:    str q0, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #1010]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #1008]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1012]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #1014]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #1016]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #1018]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #1020]
+; NONEON-NOSVE-NEXT:    add w8, w10, w8
+; NONEON-NOSVE-NEXT:    add w10, w11, w12
+; NONEON-NOSVE-NEXT:    add w11, w13, w14
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    add w10, w11, w15
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #1022]
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    add w8, w8, w11
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB7_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    ldr h2, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [x9, #250]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #242]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #960]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #974]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #960]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_3
+; NONEON-NOSVE-NEXT:    b .LBB7_4
+; NONEON-NOSVE-NEXT:  .LBB7_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB7_4
+; NONEON-NOSVE-NEXT:  .LBB7_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #2]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #912]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #924]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #944]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #944]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #940]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #896]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #196]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #212]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #912]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #928]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #896]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #930]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #928]
+; NONEON-NOSVE-NEXT:  .LBB7_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB7_20
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB7_21
+; NONEON-NOSVE-NEXT:  .LBB7_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB7_22
+; NONEON-NOSVE-NEXT:  .LBB7_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB7_23
+; NONEON-NOSVE-NEXT:  .LBB7_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB7_24
+; NONEON-NOSVE-NEXT:  .LBB7_9: // %else17
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB7_25
+; NONEON-NOSVE-NEXT:  .LBB7_10: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB7_26
+; NONEON-NOSVE-NEXT:  .LBB7_11: // %else23
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB7_27
+; NONEON-NOSVE-NEXT:  .LBB7_12: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB7_28
+; NONEON-NOSVE-NEXT:  .LBB7_13: // %else29
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB7_29
+; NONEON-NOSVE-NEXT:  .LBB7_14: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB7_30
+; NONEON-NOSVE-NEXT:  .LBB7_15: // %else35
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB7_31
+; NONEON-NOSVE-NEXT:  .LBB7_16: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB7_32
+; NONEON-NOSVE-NEXT:  .LBB7_17: // %else41
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB7_19
+; NONEON-NOSVE-NEXT:  .LBB7_18: // %cond.load43
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #30]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB7_19: // %else44
+; NONEON-NOSVE-NEXT:    add sp, sp, #1024
+; NONEON-NOSVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #848]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #880]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #880]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #832]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #862]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #878]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #134]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #150]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #848]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #832]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #864]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #868]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #864]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB7_6
+; NONEON-NOSVE-NEXT:  .LBB7_21: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #6]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #784]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #792]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #816]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #816]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #808]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #784]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #788]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #800]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #804]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #768]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #806]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #800]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB7_7
+; NONEON-NOSVE-NEXT:  .LBB7_22: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #720]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #752]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #752]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #734]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #750]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #704]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #736]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #744]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #736]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB7_8
+; NONEON-NOSVE-NEXT:  .LBB7_23: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #10]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #656]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #668]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #688]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #684]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #656]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #664]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #672]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #640]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #682]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #672]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB7_9
+; NONEON-NOSVE-NEXT:  .LBB7_24: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #600]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #624]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #592]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #576]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #606]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #608]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #622]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #576]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #620]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #608]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB7_10
+; NONEON-NOSVE-NEXT:  .LBB7_25: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #14]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #520]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #560]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #560]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #552]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #512]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #528]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #524]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #544]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #556]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #528]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #558]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #544]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB7_11
+; NONEON-NOSVE-NEXT:  .LBB7_26: // %cond.load22
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #464]
+; NONEON-NOSVE-NEXT:    add x9, sp, #464
+; NONEON-NOSVE-NEXT:    str h2, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #496]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #478]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #494]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    ldur x11, [x9, #2]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    stur x11, [x9, #18]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #448]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #480]
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB7_12
+; NONEON-NOSVE-NEXT:  .LBB7_27: // %cond.load25
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #18]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #412]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #432]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #428]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #196]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #212]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #400]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #384]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #416]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB7_13
+; NONEON-NOSVE-NEXT:  .LBB7_28: // %cond.load28
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #20]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #336]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #350]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #366]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #134]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #150]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #320]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #352]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB7_14
+; NONEON-NOSVE-NEXT:  .LBB7_29: // %cond.load31
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #22]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #280]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #304]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #276]
+; NONEON-NOSVE-NEXT:    str w10, [sp, #288]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #256]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB7_15
+; NONEON-NOSVE-NEXT:  .LBB7_30: // %cond.load34
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #240]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #222]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #238]
+; NONEON-NOSVE-NEXT:    ldur w10, [x9, #10]
+; NONEON-NOSVE-NEXT:    stur w10, [x9, #26]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #192]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #224]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB7_16
+; NONEON-NOSVE-NEXT:  .LBB7_31: // %cond.load37
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #26]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #176]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #144]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #152]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #160]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #128]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB7_17
+; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.load40
+; NONEON-NOSVE-NEXT:    ldr h2, [x0, #28]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    str h2, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp, #80]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #94]
+; NONEON-NOSVE-NEXT:    str x9, [sp, #96]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #110]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #64]
+; NONEON-NOSVE-NEXT:    str h1, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_18
+; NONEON-NOSVE-NEXT:    b .LBB7_19
   %load = call <16 x half> @llvm.masked.load.v16f16(ptr %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer)
   ret <16 x half> %load
 }
@@ -225,6 +2386,42 @@ define <2 x float> @masked_load_v2f32(ptr %src, <2 x i1> %mask) {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB8_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #44]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_3
+; NONEON-NOSVE-NEXT:    b .LBB8_4
+; NONEON-NOSVE-NEXT:  .LBB8_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI8_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI8_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB8_4
+; NONEON-NOSVE-NEXT:  .LBB8_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:  .LBB8_4: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x float> @llvm.masked.load.v2f32(ptr %src, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
   ret <2 x float> %load
 }
@@ -241,6 +2438,84 @@ define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #224
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
+; NONEON-NOSVE-NEXT:    str d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #208]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB9_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #204]
+; NONEON-NOSVE-NEXT:    stur xzr, [sp, #196]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_3
+; NONEON-NOSVE-NEXT:    b .LBB9_4
+; NONEON-NOSVE-NEXT:  .LBB9_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI9_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x9, :lo12:.LCPI9_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB9_4
+; NONEON-NOSVE-NEXT:  .LBB9_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:  .LBB9_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB9_7
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB9_8
+; NONEON-NOSVE-NEXT:  .LBB9_6: // %else8
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB9_7: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB9_6
+; NONEON-NOSVE-NEXT:  .LBB9_8: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr s1, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
+; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
   ret <4 x float> %load
 }
@@ -290,6 +2565,173 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #496
+; NONEON-NOSVE-NEXT:    str x29, [sp, #480] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 496
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #464]
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #466]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #467]
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #465]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #469]
+; NONEON-NOSVE-NEXT:    ldrb w15, [sp, #470]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #464]
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w15, w15, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #471]
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x2
+; NONEON-NOSVE-NEXT:    and w13, w13, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w11, w12, #0, #1
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x20
+; NONEON-NOSVE-NEXT:    orr w9, w9, w13
+; NONEON-NOSVE-NEXT:    and w13, w15, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w11, w9
+; NONEON-NOSVE-NEXT:    orr w11, w12, w13
+; NONEON-NOSVE-NEXT:    ldr q1, [x8, :lo12:.LCPI10_0]
+; NONEON-NOSVE-NEXT:    orr w9, w9, w11
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x80
+; NONEON-NOSVE-NEXT:    add w10, w9, w10
+; NONEON-NOSVE-NEXT:    add x9, sp, #208
+; NONEON-NOSVE-NEXT:    and w8, w10, #0xff
+; NONEON-NOSVE-NEXT:    tbz w10, #0, .LBB10_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr s0, [x0]
+; NONEON-NOSVE-NEXT:    str wzr, [sp, #460]
+; NONEON-NOSVE-NEXT:    stur xzr, [x9, #244]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #448]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_3
+; NONEON-NOSVE-NEXT:    b .LBB10_4
+; NONEON-NOSVE-NEXT:  .LBB10_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_4
+; NONEON-NOSVE-NEXT:  .LBB10_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #4]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #432]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #412]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #408]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #400]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #384]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #416]
+; NONEON-NOSVE-NEXT:  .LBB10_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB10_12
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB10_13
+; NONEON-NOSVE-NEXT:  .LBB10_6: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB10_14
+; NONEON-NOSVE-NEXT:  .LBB10_7: // %else11
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB10_15
+; NONEON-NOSVE-NEXT:  .LBB10_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB10_16
+; NONEON-NOSVE-NEXT:  .LBB10_9: // %else17
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB10_11
+; NONEON-NOSVE-NEXT:  .LBB10_10: // %cond.load19
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #28]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp s2, s1, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB10_11: // %else20
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #480] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #496
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB10_12: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #348]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #340]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #320]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #352]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB10_6
+; NONEON-NOSVE-NEXT:  .LBB10_13: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #12]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr x10, [sp, #256]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    str x10, [sp, #288]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #264]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #272]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB10_7
+; NONEON-NOSVE-NEXT:  .LBB10_14: // %cond.load10
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #240]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #220]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #236]
+; NONEON-NOSVE-NEXT:    ldur x10, [x9, #4]
+; NONEON-NOSVE-NEXT:    stur x10, [x9, #20]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #192]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #224]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB10_8
+; NONEON-NOSVE-NEXT:  .LBB10_15: // %cond.load13
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #20]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #144]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #176]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #152]
+; NONEON-NOSVE-NEXT:    stp s1, s2, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp s2, s1, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #160]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB10_9
+; NONEON-NOSVE-NEXT:  .LBB10_16: // %cond.load16
+; NONEON-NOSVE-NEXT:    ldr s2, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    str s2, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp s1, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp s1, s3, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp s1, s2, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB10_10
+; NONEON-NOSVE-NEXT:    b .LBB10_11
   %load = call <8 x float> @llvm.masked.load.v8f32(ptr %src, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
   ret <8 x float> %load
 }
@@ -306,6 +2748,42 @@ define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    str d0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB11_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #72]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_3
+; NONEON-NOSVE-NEXT:    b .LBB11_4
+; NONEON-NOSVE-NEXT:  .LBB11_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI11_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x9, :lo12:.LCPI11_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB11_4
+; NONEON-NOSVE-NEXT:  .LBB11_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr d1, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp]
+; NONEON-NOSVE-NEXT:    str d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:  .LBB11_4: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer)
   ret <2 x double> %load
 }
@@ -331,6 +2809,78 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #224
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 224
+; NONEON-NOSVE-NEXT:    str d0, [sp, #208]
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI12_0
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr q1, [x9, :lo12:.LCPI12_0]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x4
+; NONEON-NOSVE-NEXT:    and w11, w11, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w12, #0, #1
+; NONEON-NOSVE-NEXT:    orr w10, w10, w11
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB12_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #200]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB12_3
+; NONEON-NOSVE-NEXT:    b .LBB12_4
+; NONEON-NOSVE-NEXT:  .LBB12_2:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB12_4
+; NONEON-NOSVE-NEXT:  .LBB12_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldr d2, [x0, #8]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp d2, d0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #160]
+; NONEON-NOSVE-NEXT:  .LBB12_4: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB12_7
+; NONEON-NOSVE-NEXT:  // %bb.5: // %else5
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB12_8
+; NONEON-NOSVE-NEXT:  .LBB12_6: // %else8
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB12_7: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldr d2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr d2, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d1, d2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB12_6
+; NONEON-NOSVE-NEXT:  .LBB12_8: // %cond.load7
+; NONEON-NOSVE-NEXT:    ldr d2, [x0, #24]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    str d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d2, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #224
+; NONEON-NOSVE-NEXT:    ret
   %load = call <4 x double> @llvm.masked.load.v4f64(ptr %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer)
   ret <4 x double> %load
 }
@@ -356,6 +2906,55 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_zext_v3i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
+; NONEON-NOSVE-NEXT:    bfi w8, w2, #1, #1
+; NONEON-NOSVE-NEXT:    bfi w8, w3, #2, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB13_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB13_3
+; NONEON-NOSVE-NEXT:    b .LBB13_4
+; NONEON-NOSVE-NEXT:  .LBB13_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI13_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI13_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB13_4
+; NONEON-NOSVE-NEXT:  .LBB13_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:  .LBB13_4: // %else2
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB13_6
+; NONEON-NOSVE-NEXT:  // %bb.5: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:  .LBB13_6: // %else5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer)
   %extend = zext <3 x i16> %load_value to <3 x i32>
   ret <3 x i32> %extend;
@@ -382,6 +2981,55 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_load_sext_v3i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
+; NONEON-NOSVE-NEXT:    bfi w8, w2, #1, #1
+; NONEON-NOSVE-NEXT:    bfi w8, w3, #2, #1
+; NONEON-NOSVE-NEXT:    tbz w8, #0, .LBB14_2
+; NONEON-NOSVE-NEXT:  // %bb.1: // %cond.load
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0]
+; NONEON-NOSVE-NEXT:    stur wzr, [sp, #66]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #64]
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB14_3
+; NONEON-NOSVE-NEXT:    b .LBB14_4
+; NONEON-NOSVE-NEXT:  .LBB14_2:
+; NONEON-NOSVE-NEXT:    adrp x9, .LCPI14_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x9, :lo12:.LCPI14_0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB14_4
+; NONEON-NOSVE-NEXT:  .LBB14_3: // %cond.load1
+; NONEON-NOSVE-NEXT:    ldrh w9, [x0, #2]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:  .LBB14_4: // %else2
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB14_6
+; NONEON-NOSVE-NEXT:  // %bb.5: // %cond.load4
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #4]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:  .LBB14_6: // %else5
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer)
   %extend = sext <3 x i16> %load_value to <3 x i32>
   ret <3 x i32> %extend;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index bd6b96889b4cc..a79ce9db9abfd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -19,6 +20,47 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB0_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB0_6
+; NONEON-NOSVE-NEXT:  .LBB0_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB0_7
+; NONEON-NOSVE-NEXT:  .LBB0_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB0_8
+; NONEON-NOSVE-NEXT:  .LBB0_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB0_5: // %cond.store
+; NONEON-NOSVE-NEXT:    strb wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB0_2
+; NONEON-NOSVE-NEXT:  .LBB0_6: // %cond.store1
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB0_3
+; NONEON-NOSVE-NEXT:  .LBB0_7: // %cond.store3
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB0_4
+; NONEON-NOSVE-NEXT:  .LBB0_8: // %cond.store5
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4i8(<4 x i8> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
@@ -34,6 +76,84 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x80
+; NONEON-NOSVE-NEXT:    add w9, w8, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, #0xff
+; NONEON-NOSVE-NEXT:    tbnz w9, #0, .LBB1_9
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB1_10
+; NONEON-NOSVE-NEXT:  .LBB1_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB1_11
+; NONEON-NOSVE-NEXT:  .LBB1_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB1_12
+; NONEON-NOSVE-NEXT:  .LBB1_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB1_13
+; NONEON-NOSVE-NEXT:  .LBB1_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB1_14
+; NONEON-NOSVE-NEXT:  .LBB1_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB1_15
+; NONEON-NOSVE-NEXT:  .LBB1_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB1_16
+; NONEON-NOSVE-NEXT:  .LBB1_8: // %else14
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB1_9: // %cond.store
+; NONEON-NOSVE-NEXT:    strb wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB1_2
+; NONEON-NOSVE-NEXT:  .LBB1_10: // %cond.store1
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB1_3
+; NONEON-NOSVE-NEXT:  .LBB1_11: // %cond.store3
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB1_4
+; NONEON-NOSVE-NEXT:  .LBB1_12: // %cond.store5
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB1_5
+; NONEON-NOSVE-NEXT:  .LBB1_13: // %cond.store7
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB1_6
+; NONEON-NOSVE-NEXT:  .LBB1_14: // %cond.store9
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB1_7
+; NONEON-NOSVE-NEXT:  .LBB1_15: // %cond.store11
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB1_8
+; NONEON-NOSVE-NEXT:  .LBB1_16: // %cond.store13
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8i8(<8 x i8> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
 }
@@ -49,6 +169,175 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w12, w13
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w14
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB2_17
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB2_18
+; NONEON-NOSVE-NEXT:  .LBB2_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB2_19
+; NONEON-NOSVE-NEXT:  .LBB2_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB2_20
+; NONEON-NOSVE-NEXT:  .LBB2_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB2_21
+; NONEON-NOSVE-NEXT:  .LBB2_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB2_22
+; NONEON-NOSVE-NEXT:  .LBB2_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB2_23
+; NONEON-NOSVE-NEXT:  .LBB2_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB2_24
+; NONEON-NOSVE-NEXT:  .LBB2_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB2_25
+; NONEON-NOSVE-NEXT:  .LBB2_9: // %else16
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB2_26
+; NONEON-NOSVE-NEXT:  .LBB2_10: // %else18
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB2_27
+; NONEON-NOSVE-NEXT:  .LBB2_11: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB2_28
+; NONEON-NOSVE-NEXT:  .LBB2_12: // %else22
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB2_29
+; NONEON-NOSVE-NEXT:  .LBB2_13: // %else24
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB2_30
+; NONEON-NOSVE-NEXT:  .LBB2_14: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB2_31
+; NONEON-NOSVE-NEXT:  .LBB2_15: // %else28
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB2_32
+; NONEON-NOSVE-NEXT:  .LBB2_16: // %else30
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB2_17: // %cond.store
+; NONEON-NOSVE-NEXT:    strb wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB2_2
+; NONEON-NOSVE-NEXT:  .LBB2_18: // %cond.store1
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB2_3
+; NONEON-NOSVE-NEXT:  .LBB2_19: // %cond.store3
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB2_4
+; NONEON-NOSVE-NEXT:  .LBB2_20: // %cond.store5
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB2_5
+; NONEON-NOSVE-NEXT:  .LBB2_21: // %cond.store7
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB2_6
+; NONEON-NOSVE-NEXT:  .LBB2_22: // %cond.store9
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB2_7
+; NONEON-NOSVE-NEXT:  .LBB2_23: // %cond.store11
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB2_8
+; NONEON-NOSVE-NEXT:  .LBB2_24: // %cond.store13
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB2_9
+; NONEON-NOSVE-NEXT:  .LBB2_25: // %cond.store15
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB2_10
+; NONEON-NOSVE-NEXT:  .LBB2_26: // %cond.store17
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #9]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB2_11
+; NONEON-NOSVE-NEXT:  .LBB2_27: // %cond.store19
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #10]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB2_12
+; NONEON-NOSVE-NEXT:  .LBB2_28: // %cond.store21
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #11]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB2_13
+; NONEON-NOSVE-NEXT:  .LBB2_29: // %cond.store23
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #12]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB2_14
+; NONEON-NOSVE-NEXT:  .LBB2_30: // %cond.store25
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #13]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB2_15
+; NONEON-NOSVE-NEXT:  .LBB2_31: // %cond.store27
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #14]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB2_16
+; NONEON-NOSVE-NEXT:  .LBB2_32: // %cond.store29
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
 }
@@ -129,6 +418,331 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    .cfi_offset w29, -16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #160]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #264]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #248]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #240]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #232]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #224]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #208]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #200]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #192]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x20
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w1, #0x1
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #104]
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #80]
+; NONEON-NOSVE-NEXT:    sbfx w11, w11, #0, #1
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #128]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #120]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x10
+; NONEON-NOSVE-NEXT:    zip1 v2.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    and w8, w11, #0x4
+; NONEON-NOSVE-NEXT:    sbfx w10, w12, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x2
+; NONEON-NOSVE-NEXT:    sbfx w9, w7, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x80
+; NONEON-NOSVE-NEXT:    sbfx w10, w6, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w5, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x20
+; NONEON-NOSVE-NEXT:    sbfx w10, w4, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x10
+; NONEON-NOSVE-NEXT:    sbfx w9, w3, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w10, w2, #0, #1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    and w8, w9, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    and w8, w10, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w12, w13
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w9, w9, w10
+; NONEON-NOSVE-NEXT:    add w10, w12, w11
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w12, w13, w14
+; NONEON-NOSVE-NEXT:    add w14, w15, w16
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w10, w10, w12
+; NONEON-NOSVE-NEXT:    add w11, w14, w11
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w8, w13
+; NONEON-NOSVE-NEXT:    add w8, w9, w12
+; NONEON-NOSVE-NEXT:    bfi w8, w10, #16, #16
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB3_34
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB3_35
+; NONEON-NOSVE-NEXT:  .LBB3_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB3_36
+; NONEON-NOSVE-NEXT:  .LBB3_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB3_37
+; NONEON-NOSVE-NEXT:  .LBB3_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB3_38
+; NONEON-NOSVE-NEXT:  .LBB3_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB3_39
+; NONEON-NOSVE-NEXT:  .LBB3_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB3_40
+; NONEON-NOSVE-NEXT:  .LBB3_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB3_41
+; NONEON-NOSVE-NEXT:  .LBB3_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB3_42
+; NONEON-NOSVE-NEXT:  .LBB3_9: // %else16
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB3_43
+; NONEON-NOSVE-NEXT:  .LBB3_10: // %else18
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB3_44
+; NONEON-NOSVE-NEXT:  .LBB3_11: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB3_45
+; NONEON-NOSVE-NEXT:  .LBB3_12: // %else22
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB3_46
+; NONEON-NOSVE-NEXT:  .LBB3_13: // %else24
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB3_47
+; NONEON-NOSVE-NEXT:  .LBB3_14: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB3_48
+; NONEON-NOSVE-NEXT:  .LBB3_15: // %else28
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB3_49
+; NONEON-NOSVE-NEXT:  .LBB3_16: // %else30
+; NONEON-NOSVE-NEXT:    tbnz w8, #16, .LBB3_50
+; NONEON-NOSVE-NEXT:  .LBB3_17: // %else32
+; NONEON-NOSVE-NEXT:    tbnz w8, #17, .LBB3_51
+; NONEON-NOSVE-NEXT:  .LBB3_18: // %else34
+; NONEON-NOSVE-NEXT:    tbnz w8, #18, .LBB3_52
+; NONEON-NOSVE-NEXT:  .LBB3_19: // %else36
+; NONEON-NOSVE-NEXT:    tbnz w8, #19, .LBB3_53
+; NONEON-NOSVE-NEXT:  .LBB3_20: // %else38
+; NONEON-NOSVE-NEXT:    tbnz w8, #20, .LBB3_54
+; NONEON-NOSVE-NEXT:  .LBB3_21: // %else40
+; NONEON-NOSVE-NEXT:    tbnz w8, #21, .LBB3_55
+; NONEON-NOSVE-NEXT:  .LBB3_22: // %else42
+; NONEON-NOSVE-NEXT:    tbnz w8, #22, .LBB3_56
+; NONEON-NOSVE-NEXT:  .LBB3_23: // %else44
+; NONEON-NOSVE-NEXT:    tbnz w8, #23, .LBB3_57
+; NONEON-NOSVE-NEXT:  .LBB3_24: // %else46
+; NONEON-NOSVE-NEXT:    tbnz w8, #24, .LBB3_58
+; NONEON-NOSVE-NEXT:  .LBB3_25: // %else48
+; NONEON-NOSVE-NEXT:    tbnz w8, #25, .LBB3_59
+; NONEON-NOSVE-NEXT:  .LBB3_26: // %else50
+; NONEON-NOSVE-NEXT:    tbnz w8, #26, .LBB3_60
+; NONEON-NOSVE-NEXT:  .LBB3_27: // %else52
+; NONEON-NOSVE-NEXT:    tbnz w8, #27, .LBB3_61
+; NONEON-NOSVE-NEXT:  .LBB3_28: // %else54
+; NONEON-NOSVE-NEXT:    tbnz w8, #28, .LBB3_62
+; NONEON-NOSVE-NEXT:  .LBB3_29: // %else56
+; NONEON-NOSVE-NEXT:    tbnz w8, #29, .LBB3_63
+; NONEON-NOSVE-NEXT:  .LBB3_30: // %else58
+; NONEON-NOSVE-NEXT:    tbnz w8, #30, .LBB3_64
+; NONEON-NOSVE-NEXT:  .LBB3_31: // %else60
+; NONEON-NOSVE-NEXT:    tbz w8, #31, .LBB3_33
+; NONEON-NOSVE-NEXT:  .LBB3_32: // %cond.store61
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #31]
+; NONEON-NOSVE-NEXT:  .LBB3_33: // %else62
+; NONEON-NOSVE-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB3_34: // %cond.store
+; NONEON-NOSVE-NEXT:    strb wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB3_2
+; NONEON-NOSVE-NEXT:  .LBB3_35: // %cond.store1
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #1]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB3_3
+; NONEON-NOSVE-NEXT:  .LBB3_36: // %cond.store3
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB3_4
+; NONEON-NOSVE-NEXT:  .LBB3_37: // %cond.store5
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #3]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB3_5
+; NONEON-NOSVE-NEXT:  .LBB3_38: // %cond.store7
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB3_6
+; NONEON-NOSVE-NEXT:  .LBB3_39: // %cond.store9
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #5]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB3_7
+; NONEON-NOSVE-NEXT:  .LBB3_40: // %cond.store11
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #6]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB3_8
+; NONEON-NOSVE-NEXT:  .LBB3_41: // %cond.store13
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #7]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB3_9
+; NONEON-NOSVE-NEXT:  .LBB3_42: // %cond.store15
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB3_10
+; NONEON-NOSVE-NEXT:  .LBB3_43: // %cond.store17
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #9]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB3_11
+; NONEON-NOSVE-NEXT:  .LBB3_44: // %cond.store19
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #10]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB3_12
+; NONEON-NOSVE-NEXT:  .LBB3_45: // %cond.store21
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #11]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB3_13
+; NONEON-NOSVE-NEXT:  .LBB3_46: // %cond.store23
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #12]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB3_14
+; NONEON-NOSVE-NEXT:  .LBB3_47: // %cond.store25
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #13]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB3_15
+; NONEON-NOSVE-NEXT:  .LBB3_48: // %cond.store27
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #14]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB3_16
+; NONEON-NOSVE-NEXT:  .LBB3_49: // %cond.store29
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #15]
+; NONEON-NOSVE-NEXT:    tbz w8, #16, .LBB3_17
+; NONEON-NOSVE-NEXT:  .LBB3_50: // %cond.store31
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #16]
+; NONEON-NOSVE-NEXT:    tbz w8, #17, .LBB3_18
+; NONEON-NOSVE-NEXT:  .LBB3_51: // %cond.store33
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #17]
+; NONEON-NOSVE-NEXT:    tbz w8, #18, .LBB3_19
+; NONEON-NOSVE-NEXT:  .LBB3_52: // %cond.store35
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #18]
+; NONEON-NOSVE-NEXT:    tbz w8, #19, .LBB3_20
+; NONEON-NOSVE-NEXT:  .LBB3_53: // %cond.store37
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #19]
+; NONEON-NOSVE-NEXT:    tbz w8, #20, .LBB3_21
+; NONEON-NOSVE-NEXT:  .LBB3_54: // %cond.store39
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #20]
+; NONEON-NOSVE-NEXT:    tbz w8, #21, .LBB3_22
+; NONEON-NOSVE-NEXT:  .LBB3_55: // %cond.store41
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #21]
+; NONEON-NOSVE-NEXT:    tbz w8, #22, .LBB3_23
+; NONEON-NOSVE-NEXT:  .LBB3_56: // %cond.store43
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #22]
+; NONEON-NOSVE-NEXT:    tbz w8, #23, .LBB3_24
+; NONEON-NOSVE-NEXT:  .LBB3_57: // %cond.store45
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #23]
+; NONEON-NOSVE-NEXT:    tbz w8, #24, .LBB3_25
+; NONEON-NOSVE-NEXT:  .LBB3_58: // %cond.store47
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #24]
+; NONEON-NOSVE-NEXT:    tbz w8, #25, .LBB3_26
+; NONEON-NOSVE-NEXT:  .LBB3_59: // %cond.store49
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #25]
+; NONEON-NOSVE-NEXT:    tbz w8, #26, .LBB3_27
+; NONEON-NOSVE-NEXT:  .LBB3_60: // %cond.store51
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #26]
+; NONEON-NOSVE-NEXT:    tbz w8, #27, .LBB3_28
+; NONEON-NOSVE-NEXT:  .LBB3_61: // %cond.store53
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #27]
+; NONEON-NOSVE-NEXT:    tbz w8, #28, .LBB3_29
+; NONEON-NOSVE-NEXT:  .LBB3_62: // %cond.store55
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #28]
+; NONEON-NOSVE-NEXT:    tbz w8, #29, .LBB3_30
+; NONEON-NOSVE-NEXT:  .LBB3_63: // %cond.store57
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #29]
+; NONEON-NOSVE-NEXT:    tbz w8, #30, .LBB3_31
+; NONEON-NOSVE-NEXT:  .LBB3_64: // %cond.store59
+; NONEON-NOSVE-NEXT:    strb wzr, [x0, #30]
+; NONEON-NOSVE-NEXT:    tbnz w8, #31, .LBB3_32
+; NONEON-NOSVE-NEXT:    b .LBB3_33
   call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask)
   ret void
 }
@@ -154,6 +768,31 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB4_3
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB4_4
+; NONEON-NOSVE-NEXT:  .LBB4_2: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB4_3: // %cond.store
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB4_2
+; NONEON-NOSVE-NEXT:  .LBB4_4: // %cond.store1
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v2f16(<2 x half> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
   ret void
 }
@@ -169,6 +808,51 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB5_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB5_6
+; NONEON-NOSVE-NEXT:  .LBB5_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB5_7
+; NONEON-NOSVE-NEXT:  .LBB5_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB5_8
+; NONEON-NOSVE-NEXT:  .LBB5_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB5_5: // %cond.store
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB5_2
+; NONEON-NOSVE-NEXT:  .LBB5_6: // %cond.store1
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB5_3
+; NONEON-NOSVE-NEXT:  .LBB5_7: // %cond.store3
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB5_4
+; NONEON-NOSVE-NEXT:  .LBB5_8: // %cond.store5
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f16(<4 x half> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
@@ -185,6 +869,92 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x80
+; NONEON-NOSVE-NEXT:    add w9, w8, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, #0xff
+; NONEON-NOSVE-NEXT:    tbnz w9, #0, .LBB6_9
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB6_10
+; NONEON-NOSVE-NEXT:  .LBB6_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB6_11
+; NONEON-NOSVE-NEXT:  .LBB6_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB6_12
+; NONEON-NOSVE-NEXT:  .LBB6_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB6_13
+; NONEON-NOSVE-NEXT:  .LBB6_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB6_14
+; NONEON-NOSVE-NEXT:  .LBB6_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB6_15
+; NONEON-NOSVE-NEXT:  .LBB6_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB6_16
+; NONEON-NOSVE-NEXT:  .LBB6_8: // %else14
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB6_9: // %cond.store
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB6_2
+; NONEON-NOSVE-NEXT:  .LBB6_10: // %cond.store1
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB6_3
+; NONEON-NOSVE-NEXT:  .LBB6_11: // %cond.store3
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB6_4
+; NONEON-NOSVE-NEXT:  .LBB6_12: // %cond.store5
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB6_5
+; NONEON-NOSVE-NEXT:  .LBB6_13: // %cond.store7
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB6_6
+; NONEON-NOSVE-NEXT:  .LBB6_14: // %cond.store9
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #10]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB6_7
+; NONEON-NOSVE-NEXT:  .LBB6_15: // %cond.store11
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #12]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB6_8
+; NONEON-NOSVE-NEXT:  .LBB6_16: // %cond.store13
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #14]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
 }
@@ -209,6 +979,191 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; CHECK-NEXT:    st1h { z1.h }, p1, [x0, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x80
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x40
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; NONEON-NOSVE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    add w10, w12, w13
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w14
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB7_17
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB7_18
+; NONEON-NOSVE-NEXT:  .LBB7_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB7_19
+; NONEON-NOSVE-NEXT:  .LBB7_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB7_20
+; NONEON-NOSVE-NEXT:  .LBB7_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB7_21
+; NONEON-NOSVE-NEXT:  .LBB7_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB7_22
+; NONEON-NOSVE-NEXT:  .LBB7_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB7_23
+; NONEON-NOSVE-NEXT:  .LBB7_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB7_24
+; NONEON-NOSVE-NEXT:  .LBB7_8: // %else14
+; NONEON-NOSVE-NEXT:    tbnz w8, #8, .LBB7_25
+; NONEON-NOSVE-NEXT:  .LBB7_9: // %else16
+; NONEON-NOSVE-NEXT:    tbnz w8, #9, .LBB7_26
+; NONEON-NOSVE-NEXT:  .LBB7_10: // %else18
+; NONEON-NOSVE-NEXT:    tbnz w8, #10, .LBB7_27
+; NONEON-NOSVE-NEXT:  .LBB7_11: // %else20
+; NONEON-NOSVE-NEXT:    tbnz w8, #11, .LBB7_28
+; NONEON-NOSVE-NEXT:  .LBB7_12: // %else22
+; NONEON-NOSVE-NEXT:    tbnz w8, #12, .LBB7_29
+; NONEON-NOSVE-NEXT:  .LBB7_13: // %else24
+; NONEON-NOSVE-NEXT:    tbnz w8, #13, .LBB7_30
+; NONEON-NOSVE-NEXT:  .LBB7_14: // %else26
+; NONEON-NOSVE-NEXT:    tbnz w8, #14, .LBB7_31
+; NONEON-NOSVE-NEXT:  .LBB7_15: // %else28
+; NONEON-NOSVE-NEXT:    tbnz w8, #15, .LBB7_32
+; NONEON-NOSVE-NEXT:  .LBB7_16: // %else30
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB7_17: // %cond.store
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB7_2
+; NONEON-NOSVE-NEXT:  .LBB7_18: // %cond.store1
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #2]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB7_3
+; NONEON-NOSVE-NEXT:  .LBB7_19: // %cond.store3
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB7_4
+; NONEON-NOSVE-NEXT:  .LBB7_20: // %cond.store5
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #6]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB7_5
+; NONEON-NOSVE-NEXT:  .LBB7_21: // %cond.store7
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB7_6
+; NONEON-NOSVE-NEXT:  .LBB7_22: // %cond.store9
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #10]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB7_7
+; NONEON-NOSVE-NEXT:  .LBB7_23: // %cond.store11
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #12]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB7_8
+; NONEON-NOSVE-NEXT:  .LBB7_24: // %cond.store13
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #14]
+; NONEON-NOSVE-NEXT:    tbz w8, #8, .LBB7_9
+; NONEON-NOSVE-NEXT:  .LBB7_25: // %cond.store15
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #16]
+; NONEON-NOSVE-NEXT:    tbz w8, #9, .LBB7_10
+; NONEON-NOSVE-NEXT:  .LBB7_26: // %cond.store17
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #18]
+; NONEON-NOSVE-NEXT:    tbz w8, #10, .LBB7_11
+; NONEON-NOSVE-NEXT:  .LBB7_27: // %cond.store19
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #20]
+; NONEON-NOSVE-NEXT:    tbz w8, #11, .LBB7_12
+; NONEON-NOSVE-NEXT:  .LBB7_28: // %cond.store21
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #22]
+; NONEON-NOSVE-NEXT:    tbz w8, #12, .LBB7_13
+; NONEON-NOSVE-NEXT:  .LBB7_29: // %cond.store23
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #24]
+; NONEON-NOSVE-NEXT:    tbz w8, #13, .LBB7_14
+; NONEON-NOSVE-NEXT:  .LBB7_30: // %cond.store25
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #26]
+; NONEON-NOSVE-NEXT:    tbz w8, #14, .LBB7_15
+; NONEON-NOSVE-NEXT:  .LBB7_31: // %cond.store27
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #28]
+; NONEON-NOSVE-NEXT:    tbz w8, #15, .LBB7_16
+; NONEON-NOSVE-NEXT:  .LBB7_32: // %cond.store29
+; NONEON-NOSVE-NEXT:    fmov s0, wzr
+; NONEON-NOSVE-NEXT:    str h0, [x0, #30]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
   ret void
 }
@@ -225,6 +1180,47 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB8_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB8_6
+; NONEON-NOSVE-NEXT:  .LBB8_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB8_7
+; NONEON-NOSVE-NEXT:  .LBB8_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB8_8
+; NONEON-NOSVE-NEXT:  .LBB8_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB8_5: // %cond.store
+; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB8_2
+; NONEON-NOSVE-NEXT:  .LBB8_6: // %cond.store1
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB8_3
+; NONEON-NOSVE-NEXT:  .LBB8_7: // %cond.store3
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB8_4
+; NONEON-NOSVE-NEXT:  .LBB8_8: // %cond.store5
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #12]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
@@ -275,6 +1271,84 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w13, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w14, [sp, #6]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    ldrb w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w12, w12, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w13, w13, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x4
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x8
+; NONEON-NOSVE-NEXT:    sbfx w14, w14, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x2
+; NONEON-NOSVE-NEXT:    and w12, w12, #0x10
+; NONEON-NOSVE-NEXT:    bfxil w10, w11, #0, #1
+; NONEON-NOSVE-NEXT:    and w11, w13, #0x20
+; NONEON-NOSVE-NEXT:    orr w8, w8, w12
+; NONEON-NOSVE-NEXT:    and w12, w14, #0x40
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    orr w8, w10, w8
+; NONEON-NOSVE-NEXT:    orr w10, w11, w12
+; NONEON-NOSVE-NEXT:    orr w8, w8, w10
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x80
+; NONEON-NOSVE-NEXT:    add w9, w8, w9
+; NONEON-NOSVE-NEXT:    and w8, w9, #0xff
+; NONEON-NOSVE-NEXT:    tbnz w9, #0, .LBB9_9
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB9_10
+; NONEON-NOSVE-NEXT:  .LBB9_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB9_11
+; NONEON-NOSVE-NEXT:  .LBB9_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB9_12
+; NONEON-NOSVE-NEXT:  .LBB9_4: // %else6
+; NONEON-NOSVE-NEXT:    tbnz w8, #4, .LBB9_13
+; NONEON-NOSVE-NEXT:  .LBB9_5: // %else8
+; NONEON-NOSVE-NEXT:    tbnz w8, #5, .LBB9_14
+; NONEON-NOSVE-NEXT:  .LBB9_6: // %else10
+; NONEON-NOSVE-NEXT:    tbnz w8, #6, .LBB9_15
+; NONEON-NOSVE-NEXT:  .LBB9_7: // %else12
+; NONEON-NOSVE-NEXT:    tbnz w8, #7, .LBB9_16
+; NONEON-NOSVE-NEXT:  .LBB9_8: // %else14
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB9_9: // %cond.store
+; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB9_2
+; NONEON-NOSVE-NEXT:  .LBB9_10: // %cond.store1
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #4]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB9_3
+; NONEON-NOSVE-NEXT:  .LBB9_11: // %cond.store3
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB9_4
+; NONEON-NOSVE-NEXT:  .LBB9_12: // %cond.store5
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #12]
+; NONEON-NOSVE-NEXT:    tbz w8, #4, .LBB9_5
+; NONEON-NOSVE-NEXT:  .LBB9_13: // %cond.store7
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #16]
+; NONEON-NOSVE-NEXT:    tbz w8, #5, .LBB9_6
+; NONEON-NOSVE-NEXT:  .LBB9_14: // %cond.store9
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #20]
+; NONEON-NOSVE-NEXT:    tbz w8, #6, .LBB9_7
+; NONEON-NOSVE-NEXT:  .LBB9_15: // %cond.store11
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #24]
+; NONEON-NOSVE-NEXT:    tbz w8, #7, .LBB9_8
+; NONEON-NOSVE-NEXT:  .LBB9_16: // %cond.store13
+; NONEON-NOSVE-NEXT:    str wzr, [x0, #28]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
   ret void
 }
@@ -291,6 +1365,29 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    bfxil w8, w9, #0, #1
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB10_3
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB10_4
+; NONEON-NOSVE-NEXT:  .LBB10_2: // %else2
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB10_3: // %cond.store
+; NONEON-NOSVE-NEXT:    str xzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB10_2
+; NONEON-NOSVE-NEXT:  .LBB10_4: // %cond.store1
+; NONEON-NOSVE-NEXT:    str xzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
   ret void
 }
@@ -315,6 +1412,47 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    st1d { z0.d }, p1, [x0, x8, lsl #3]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: masked_store_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp]
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #0, #1
+; NONEON-NOSVE-NEXT:    sbfx w10, w10, #0, #1
+; NONEON-NOSVE-NEXT:    and w8, w8, #0x2
+; NONEON-NOSVE-NEXT:    and w9, w9, #0x4
+; NONEON-NOSVE-NEXT:    and w10, w10, #0x8
+; NONEON-NOSVE-NEXT:    bfxil w8, w11, #0, #1
+; NONEON-NOSVE-NEXT:    orr w9, w9, w10
+; NONEON-NOSVE-NEXT:    orr w8, w8, w9
+; NONEON-NOSVE-NEXT:    tbnz w8, #0, .LBB11_5
+; NONEON-NOSVE-NEXT:  // %bb.1: // %else
+; NONEON-NOSVE-NEXT:    tbnz w8, #1, .LBB11_6
+; NONEON-NOSVE-NEXT:  .LBB11_2: // %else2
+; NONEON-NOSVE-NEXT:    tbnz w8, #2, .LBB11_7
+; NONEON-NOSVE-NEXT:  .LBB11_3: // %else4
+; NONEON-NOSVE-NEXT:    tbnz w8, #3, .LBB11_8
+; NONEON-NOSVE-NEXT:  .LBB11_4: // %else6
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
+; NONEON-NOSVE-NEXT:  .LBB11_5: // %cond.store
+; NONEON-NOSVE-NEXT:    str xzr, [x0]
+; NONEON-NOSVE-NEXT:    tbz w8, #1, .LBB11_2
+; NONEON-NOSVE-NEXT:  .LBB11_6: // %cond.store1
+; NONEON-NOSVE-NEXT:    str xzr, [x0, #8]
+; NONEON-NOSVE-NEXT:    tbz w8, #2, .LBB11_3
+; NONEON-NOSVE-NEXT:  .LBB11_7: // %cond.store3
+; NONEON-NOSVE-NEXT:    str xzr, [x0, #16]
+; NONEON-NOSVE-NEXT:    tbz w8, #3, .LBB11_4
+; NONEON-NOSVE-NEXT:  .LBB11_8: // %cond.store5
+; NONEON-NOSVE-NEXT:    str xzr, [x0, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   call void @llvm.masked.store.v4f64(<4 x double> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index aef446a90df65..dbdf5f2502999 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -14,6 +15,26 @@ define void @add_v4i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x1, #3]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrb w11, [x0, #1]
+; NONEON-NOSVE-NEXT:    ldrb w12, [x1, #2]
+; NONEON-NOSVE-NEXT:    ldrb w13, [x0]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    ldrb w14, [x1, #1]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x1]
+; NONEON-NOSVE-NEXT:    add w10, w10, w12
+; NONEON-NOSVE-NEXT:    strb w8, [x0, #3]
+; NONEON-NOSVE-NEXT:    add w8, w11, w14
+; NONEON-NOSVE-NEXT:    add w9, w13, w9
+; NONEON-NOSVE-NEXT:    strb w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [x0, #1]
+; NONEON-NOSVE-NEXT:    strb w9, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i8>, ptr %a
   %op2 = load <4 x i8>, ptr %b
   %res = add <4 x i8> %op1, %op2
@@ -29,6 +50,50 @@ define void @add_v8i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %op2 = load <8 x i8>, ptr %b
   %res = add <8 x i8> %op1, %op2
@@ -44,6 +109,81 @@ define void @add_v16i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %op2 = load <16 x i8>, ptr %b
   %res = add <16 x i8> %op1, %op2
@@ -60,6 +200,147 @@ define void @add_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #71]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %res = add <32 x i8> %op1, %op2
@@ -76,6 +357,18 @@ define void @add_v2i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0]
+; NONEON-NOSVE-NEXT:    ldrh w9, [x1]
+; NONEON-NOSVE-NEXT:    ldrh w10, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrh w11, [x1, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    add w9, w10, w11
+; NONEON-NOSVE-NEXT:    strh w8, [x0]
+; NONEON-NOSVE-NEXT:    strh w9, [x0, #2]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i16>, ptr %a
   %op2 = load <2 x i16>, ptr %b
   %res = add <2 x i16> %op1, %op2
@@ -91,6 +384,34 @@ define void @add_v4i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %op2 = load <4 x i16>, ptr %b
   %res = add <4 x i16> %op1, %op2
@@ -106,6 +427,49 @@ define void @add_v8i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
   %res = add <8 x i16> %op1, %op2
@@ -122,6 +486,83 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    add z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: add_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %res = add <16 x i16> %op1, %op2
@@ -137,6 +578,23 @@ define void @abs_v2i32(ptr %a) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
   store <2 x i32> %res, ptr %a
@@ -151,6 +609,30 @@ define void @abs_v4i32(ptr %a) {
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
   store <4 x i32> %res, ptr %a
@@ -166,6 +648,44 @@ define void @abs_v8i32(ptr %a) {
 ; CHECK-NEXT:    abs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w9, w8, mi
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    cmp w8, #0
+; NONEON-NOSVE-NEXT:    cneg w8, w8, mi
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
   store <8 x i32> %res, ptr %a
@@ -180,6 +700,23 @@ define void @abs_v2i64(ptr %a) {
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
   store <2 x i64> %res, ptr %a
@@ -195,6 +732,30 @@ define void @abs_v4i64(ptr %a) {
 ; CHECK-NEXT:    abs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: abs_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x9, x8, mi
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    cmp x8, #0
+; NONEON-NOSVE-NEXT:    cneg x8, x8, mi
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
   store <4 x i64> %res, ptr %a
@@ -211,6 +772,36 @@ define void @fadd_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr w8, [x1]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    str w8, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x half>, ptr %a
   %op2 = load <2 x half>, ptr %b
   %res = fadd <2 x half> %op1, %op2
@@ -227,6 +818,46 @@ define void @fadd_v4f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x half>, ptr %a
   %op2 = load <4 x half>, ptr %b
   %res = fadd <4 x half> %op1, %op2
@@ -243,6 +874,73 @@ define void @fadd_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %op2 = load <8 x half>, ptr %b
   %res = fadd <8 x half> %op1, %op2
@@ -261,6 +959,131 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #46]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #44]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #42]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #58]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #38]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #54]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #36]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #34]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #50]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #32]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #26]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #6]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #4]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp, #2]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldr h1, [sp]
+; NONEON-NOSVE-NEXT:    fcvt s1, h1
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fcvt s0, h0
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %res = fadd <16 x half> %op1, %op2
@@ -277,6 +1100,24 @@ define void @fadd_v2f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr d0, [x1]
+; NONEON-NOSVE-NEXT:    ldr d1, [x0]
+; NONEON-NOSVE-NEXT:    stp d1, d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x float>, ptr %a
   %op2 = load <2 x float>, ptr %b
   %res = fadd <2 x float> %op1, %op2
@@ -293,6 +1134,29 @@ define void @fadd_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %op2 = load <4 x float>, ptr %b
   %res = fadd <4 x float> %op1, %op2
@@ -311,6 +1175,43 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #60]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s2, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #20]
+; NONEON-NOSVE-NEXT:    fadd s3, s2, s0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %res = fadd <8 x float> %op1, %op2
@@ -327,6 +1228,23 @@ define void @fadd_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %op2 = load <2 x double>, ptr %b
   %res = fadd <2 x double> %op1, %op2
@@ -345,6 +1263,31 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fadd_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d2, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd d3, d2, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %res = fadd <4 x double> %op1, %op2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index 6d91253caae58..8c23f5f9922da 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -15,6 +16,74 @@ define void @test_revbv16i16(ptr %a) {
 ; CHECK-NEXT:    revb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revbv16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 undef, i32 24, i32 27, i32 undef, i32 29, i32 28, i32 undef, i32 undef>
   store <32 x i8> %tmp2, ptr %a
@@ -31,6 +100,74 @@ define void @test_revbv8i32(ptr %a) {
 ; CHECK-NEXT:    revb z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revbv8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
   store <32 x i8> %tmp2, ptr %a
@@ -47,6 +184,74 @@ define void @test_revbv4i64(ptr %a) {
 ; CHECK-NEXT:    revb z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revbv4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 31, i32 30, i32 29, i32 undef, i32 27, i32 undef, i32 undef, i32 undef>
   store <32 x i8> %tmp2, ptr %a
@@ -63,6 +268,38 @@ define void @test_revhv8i32(ptr %a) {
 ; CHECK-NEXT:    revh z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revhv8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
   store <16 x i16> %tmp2, ptr %a
@@ -79,6 +316,38 @@ define void @test_revhv8f32(ptr %a) {
 ; CHECK-NEXT:    revh z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revhv8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x half>, ptr %a
   %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
   store <16 x half> %tmp2, ptr %a
@@ -95,6 +364,38 @@ define void @test_revhv4i64(ptr %a) {
 ; CHECK-NEXT:    revh z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revhv4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
   store <16 x i16> %tmp2, ptr %a
@@ -111,6 +412,26 @@ define void @test_revwv4i64(ptr %a) {
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revwv4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   store <8 x i32> %tmp2, ptr %a
@@ -127,6 +448,26 @@ define void @test_revwv4f64(ptr %a) {
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revwv4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x float>, ptr %a
   %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
   store <8 x float> %tmp2, ptr %a
@@ -141,6 +482,47 @@ define <16 x i8> @test_revv16i8(ptr %a) {
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revv16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %a
   %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
   ret <16 x i8> %tmp2
@@ -156,6 +538,26 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revwv8i32v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
@@ -176,6 +578,62 @@ define void @test_revhv32i16(ptr %a) {
 ; CHECK-NEXT:    stp q0, q1, [x0, #32]
 ; CHECK-NEXT:    stp q2, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revhv32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr q3, [sp, #112]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldur w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    ror w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
   store <32 x i16> %tmp2, ptr %a
@@ -191,6 +649,22 @@ define void @test_rev_elts_fail(ptr %a) {
 ; CHECK-NEXT:    tbl z0.d, { z2.d }, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_rev_elts_fail:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   store <4 x i64> %tmp2, ptr %a
@@ -208,6 +682,15 @@ define void @test_revdv4i64_sve2p1(ptr %a) #1 {
 ; CHECK-NEXT:    revd z1.q, p0/m, z1.q
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revdv4i64_sve2p1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ptrue p0.d, vl2
+; NONEON-NOSVE-NEXT:    revd z0.q, p0/m, z0.q
+; NONEON-NOSVE-NEXT:    revd z1.q, p0/m, z1.q
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   store <4 x i64> %tmp2, ptr %a
@@ -223,6 +706,15 @@ define void @test_revdv4f64_sve2p1(ptr %a) #1 {
 ; CHECK-NEXT:    revd z1.q, p0/m, z1.q
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revdv4f64_sve2p1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ptrue p0.d
+; NONEON-NOSVE-NEXT:    revd z0.q, p0/m, z0.q
+; NONEON-NOSVE-NEXT:    revd z1.q, p0/m, z1.q
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   store <4 x double> %tmp2, ptr %a
@@ -238,6 +730,27 @@ define void @test_revv8i32(ptr %a) {
 ; CHECK-NEXT:    tbl z0.s, { z2.s }, z0.s
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_revv8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   store <8 x i32> %tmp2, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index 8808ad9a23d7c..bc6fdd1ecd5a7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -68,6 +69,86 @@ define void @zip1_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip1_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
   %tmp2 = load volatile <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
@@ -196,6 +277,153 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 192
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x1]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q5, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    stp q6, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q7, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q4, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #126]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #124]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #122]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #118]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #116]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #114]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #112]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #110]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [sp, #160]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #106]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #102]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #100]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #98]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = load <32 x i16>, ptr %b
   %tmp3 = shufflevector <32 x i16> %tmp1, <32 x i16> %tmp2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
@@ -244,6 +472,54 @@ define void @zip1_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip1_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
   %tmp2 = load volatile <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -276,6 +552,30 @@ define void @zip1_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip1_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = load volatile <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -298,6 +598,32 @@ define void @zip_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d3, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fadd d0, d3, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
   %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -330,6 +656,33 @@ define void @zip_v4i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %a
   %tmp2 = load <4 x i32>, ptr %b
   %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -351,6 +704,26 @@ define void @zip1_v8i32_undef(ptr %a) {
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip1_v8i32_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w9, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w10, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load  volatile <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
   store volatile <8 x i32> %tmp2, ptr %a
@@ -370,6 +743,135 @@ define void @trn_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.b, z1.b, z2.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #67]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
@@ -392,6 +894,36 @@ define void @trn_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z0.h, z1.h, z0.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #4]
+; NONEON-NOSVE-NEXT:    add w10, w9, w8
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w10, w11, w10
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #12]
+; NONEON-NOSVE-NEXT:    add w11, w10, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %a
   %tmp2 = load <8 x i16>, ptr %b
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 7, i32 2, i32 6, i32 4, i32 5, i32 1, i32 3>
@@ -414,6 +946,83 @@ define void @trn_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.h, z1.h, z2.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -436,6 +1045,29 @@ define void @trn_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add z1.s, z1.s, z2.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #16]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
@@ -459,6 +1091,29 @@ define void @trn_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q3, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d1, d0, [sp]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
   %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -479,6 +1134,27 @@ define void @trn_v4f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z2.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    fadd s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    fadd s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x float>, ptr %a
   %tmp2 = load <4 x float>, ptr %b
   %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -500,6 +1176,28 @@ define void @trn_v8i32_undef(ptr %a) {
 ; CHECK-NEXT:    add z1.s, z3.s, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trn_v8i32_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -571,6 +1269,86 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip2_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
   %tmp2 = load volatile <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
@@ -617,6 +1395,54 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip2_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
   %tmp2 = load volatile <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -649,6 +1475,30 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip2_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w10, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w8, w11, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w9, w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w10, w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = load volatile <8 x i32>, ptr %b
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -668,6 +1518,26 @@ define void @zip2_v8i32_undef(ptr %a) #0{
 ; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip2_v8i32_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #48
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp]
+; NONEON-NOSVE-NEXT:    ldp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w9, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w10, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
   %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
   store volatile <8 x i32> %tmp2, ptr %a
@@ -869,6 +1739,135 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #95]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #91]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #50]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #89]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #87]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #85]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #83]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #81]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #75]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #69]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrb w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
   %tmp3 = shufflevector <32 x i8> %tmp1, <32 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
@@ -891,6 +1890,26 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    add z0.h, z1.h, z0.h
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    add w9, w9, w8
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %a
   %tmp2 = load <4 x i16>, ptr %b
   %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
@@ -1008,6 +2027,83 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #60]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #52]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
   %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1047,6 +2143,35 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q2, q1, [x0]
+; NONEON-NOSVE-NEXT:    mov x8, #9205357640488583168 // =0x7fc000007fc00000
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #56]
+; NONEON-NOSVE-NEXT:    mov w8, #2143289344 // =0x7fc00000
+; NONEON-NOSVE-NEXT:    str w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    str s0, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fadd s2, s1, s0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    stp s0, s2, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    fadd s0, s1, s0
+; NONEON-NOSVE-NEXT:    str s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x float>, ptr %a
   %tmp2 = load <8 x float>, ptr %b
   %tmp3 = shufflevector <8 x float> %tmp1, <8 x float> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 6, i32 undef, i32 10, i32 12, i32 14>
@@ -1069,6 +2194,31 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    add x8, x9, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
   %tmp2 = load <4 x i64>, ptr %b
   %tmp3 = shufflevector <4 x i64> %tmp1, <4 x i64> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1136,6 +2286,49 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #28]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %a
   %tmp2 = load <8 x i16>, ptr %b
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1174,6 +2367,28 @@ define void @uzp_v8i32_undef(ptr %a) #0{
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: uzp_v8i32_undef:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    add w8, w9, w8
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 0, i32 2, i32 4, i32 6>
   %tmp4 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 1, i32 3, i32 5, i32 7>
@@ -1197,6 +2412,32 @@ define void @zip_vscale2_4(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: zip_vscale2_4:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 96
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q3, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #56]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldp d3, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fadd d0, d3, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #48]
+; NONEON-NOSVE-NEXT:    fadd d2, d1, d0
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    fadd d0, d1, d0
+; NONEON-NOSVE-NEXT:    stp d0, d2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
   %tmp3 = shufflevector <4 x double> %tmp1, <4 x double> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
index 8039bd096bcb8..8ebf713a671f4 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -35,6 +36,80 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ptest_v16i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    mov w8, #255 // =0xff
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #40]
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w9, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp]
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #32]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #48]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    cmp w11, w10
+; NONEON-NOSVE-NEXT:    csel w10, w11, w10, hi
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    cmp w10, w9
+; NONEON-NOSVE-NEXT:    csel w9, w10, w9, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w10
+; NONEON-NOSVE-NEXT:    csel w9, w9, w10, hi
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csel w8, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
   %v2 = fcmp une <16 x float> %v1, zeroinitializer
@@ -92,6 +167,148 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ptest_or_v16i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    csinv w8, w8, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w9, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    csinv w9, w9, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csinv w10, w10, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    csinv w11, w11, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp]
+; NONEON-NOSVE-NEXT:    orr w10, w11, w10
+; NONEON-NOSVE-NEXT:    csetm w12, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w12, w12, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csinv w14, w13, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    orr w12, w14, w12
+; NONEON-NOSVE-NEXT:    orr w10, w12, w10
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    orr w9, w10, w9
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    csinv w13, w13, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    csetm w15, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #112]
+; NONEON-NOSVE-NEXT:    csinv w15, w15, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    and w11, w15, #0xff
+; NONEON-NOSVE-NEXT:    csetm w16, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #68]
+; NONEON-NOSVE-NEXT:    csinv w16, w16, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w17, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #120]
+; NONEON-NOSVE-NEXT:    csinv w17, w17, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    csetm w18, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #48]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csinv w18, w18, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w0, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csinv w0, w0, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csetm w1, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csinv w1, w1, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w2, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    csinv w2, w2, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w3, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csinv w3, w3, wzr, eq
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w4, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csinv w10, w4, wzr, eq
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, hi
+; NONEON-NOSVE-NEXT:    and w9, w13, #0xff
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w16, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    and w11, w17, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w18, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    and w11, w0, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w1, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    and w11, w2, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    and w9, w3, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, hi
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, hi
+; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
   %v2 = fcmp une <16 x float> %v1, zeroinitializer
@@ -159,6 +376,148 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: ptest_and_v16i1:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    str q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #48]
+; NONEON-NOSVE-NEXT:    str q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp, #96]
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #48]
+; NONEON-NOSVE-NEXT:    csetm w8, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w8, w8, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #12]
+; NONEON-NOSVE-NEXT:    csetm w9, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    csel w9, w9, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    csetm w10, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w10, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csetm w11, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s1, s0, [sp, #16]
+; NONEON-NOSVE-NEXT:    csel w11, w11, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s2, s0, [sp]
+; NONEON-NOSVE-NEXT:    and w10, w11, w10
+; NONEON-NOSVE-NEXT:    csetm w12, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w12, w12, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #104]
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csel w14, w13, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #56]
+; NONEON-NOSVE-NEXT:    and w12, w14, w12
+; NONEON-NOSVE-NEXT:    and w10, w12, w10
+; NONEON-NOSVE-NEXT:    csetm w13, ne
+; NONEON-NOSVE-NEXT:    and w9, w10, w9
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr q0, [x1, #32]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #112]
+; NONEON-NOSVE-NEXT:    csel w13, w13, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr s1, [sp, #64]
+; NONEON-NOSVE-NEXT:    csetm w15, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #112]
+; NONEON-NOSVE-NEXT:    csel w15, w15, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    and w11, w15, #0xff
+; NONEON-NOSVE-NEXT:    csetm w16, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #68]
+; NONEON-NOSVE-NEXT:    csel w16, w16, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w17, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #120]
+; NONEON-NOSVE-NEXT:    csel w17, w17, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    csetm w18, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldr q1, [x1, #48]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel w18, w18, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    ldr s2, [sp, #32]
+; NONEON-NOSVE-NEXT:    csetm w0, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #80]
+; NONEON-NOSVE-NEXT:    csel w0, w0, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csetm w1, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csel w1, w1, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #36]
+; NONEON-NOSVE-NEXT:    csetm w2, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldp s0, s2, [sp, #88]
+; NONEON-NOSVE-NEXT:    csel w2, w2, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    csetm w3, ne
+; NONEON-NOSVE-NEXT:    fcmp s1, #0.0
+; NONEON-NOSVE-NEXT:    csel w3, w3, wzr, ne
+; NONEON-NOSVE-NEXT:    fcmp s2, #0.0
+; NONEON-NOSVE-NEXT:    csetm w4, ne
+; NONEON-NOSVE-NEXT:    fcmp s0, #0.0
+; NONEON-NOSVE-NEXT:    csel w10, w4, wzr, ne
+; NONEON-NOSVE-NEXT:    cmp w9, w8
+; NONEON-NOSVE-NEXT:    csel w8, w9, w8, lo
+; NONEON-NOSVE-NEXT:    and w9, w13, #0xff
+; NONEON-NOSVE-NEXT:    and w10, w10, #0xff
+; NONEON-NOSVE-NEXT:    and w8, w8, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w16, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    and w11, w17, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w18, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    and w11, w0, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w1, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    and w11, w2, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    and w9, w3, #0xff
+; NONEON-NOSVE-NEXT:    cmp w8, w11
+; NONEON-NOSVE-NEXT:    csel w8, w8, w11, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w9
+; NONEON-NOSVE-NEXT:    csel w8, w8, w9, lo
+; NONEON-NOSVE-NEXT:    cmp w8, w10
+; NONEON-NOSVE-NEXT:    csel w8, w8, w10, lo
+; NONEON-NOSVE-NEXT:    and w0, w8, #0x1
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
+; NONEON-NOSVE-NEXT:    ret
   %v0 = bitcast ptr %a to ptr
   %v1 = load <16 x float>, ptr %v0, align 4
   %v2 = fcmp une <16 x float> %v1, zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index 726fd28c90ae2..bc0fc7c79391d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -18,6 +19,30 @@ define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) {
 ; CHECK-NEXT:    lsr z0.h, z0.h, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %op)
   ret <4 x i8> %res
 }
@@ -30,6 +55,46 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
   ret <8 x i8> %res
 }
@@ -42,6 +107,78 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) {
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
   ret <16 x i8> %res
 }
@@ -55,6 +192,144 @@ define void @bitreverse_v32i8(ptr %a) {
 ; CHECK-NEXT:    rbit z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <32 x i8>, ptr %a
   %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
   store <32 x i8> %res, ptr %a
@@ -70,6 +345,21 @@ define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    lsr z0.s, z0.s, #16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w9, w8, #16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -82,6 +372,30 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -94,6 +408,46 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -107,6 +461,80 @@ define void @bitreverse_v16i16(ptr %a) {
 ; CHECK-NEXT:    rbit z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -121,6 +549,19 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -133,6 +574,24 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -146,6 +605,36 @@ define void @bitreverse_v8i32(ptr %a) {
 ; CHECK-NEXT:    rbit z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    rbit w9, w8
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    rbit w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -160,6 +649,17 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -172,6 +672,19 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -185,6 +698,26 @@ define void @bitreverse_v4i64(ptr %a) {
 ; CHECK-NEXT:    rbit z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bitreverse_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    rbit x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    rbit x9, x8
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    rbit x8, x8
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
@@ -204,6 +737,35 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) {
 ; CHECK-NEXT:    lsr z0.s, z0.s, #16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op)
   ret <2 x i16> %res
 }
@@ -216,6 +778,30 @@ define <4 x i16> @bswap_v4i16(<4 x i16> %op) {
 ; CHECK-NEXT:    revb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
   ret <4 x i16> %res
 }
@@ -228,6 +814,46 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) {
 ; CHECK-NEXT:    revb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
   ret <8 x i16> %res
 }
@@ -241,6 +867,83 @@ define void @bswap_v16i16(ptr %a) {
 ; CHECK-NEXT:    revb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <16 x i16>, ptr %a
   %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
   store <16 x i16> %res, ptr %a
@@ -255,6 +958,30 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) {
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
   ret <2 x i32> %res
 }
@@ -267,6 +994,46 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) {
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
   ret <4 x i32> %res
 }
@@ -280,6 +1047,83 @@ define void @bswap_v8i32(ptr %a) {
 ; CHECK-NEXT:    revb z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <8 x i32>, ptr %a
   %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
   store <8 x i32> %res, ptr %a
@@ -294,6 +1138,30 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) {
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
   ret <1 x i64> %res
 }
@@ -306,6 +1174,46 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) {
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
   ret <2 x i64> %res
 }
@@ -319,6 +1227,83 @@ define void @bswap_v4i64(ptr %a) {
 ; CHECK-NEXT:    revb z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: bswap_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op = load <4 x i64>, ptr %a
   %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
index c022bf85e67e9..df019ce2e0ad6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -14,6 +15,42 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) {
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #2]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp]
+; NONEON-NOSVE-NEXT:    sxtb w11, w8
+; NONEON-NOSVE-NEXT:    sxtb w13, w9
+; NONEON-NOSVE-NEXT:    sxtb w14, w10
+; NONEON-NOSVE-NEXT:    sxtb w15, w12
+; NONEON-NOSVE-NEXT:    ubfx w11, w11, #10, #5
+; NONEON-NOSVE-NEXT:    ubfx w13, w13, #10, #5
+; NONEON-NOSVE-NEXT:    ubfx w14, w14, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w11
+; NONEON-NOSVE-NEXT:    ubfx w11, w15, #10, #5
+; NONEON-NOSVE-NEXT:    add w9, w9, w13
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    add w10, w10, w14
+; NONEON-NOSVE-NEXT:    sxtb w9, w9
+; NONEON-NOSVE-NEXT:    add w11, w12, w11
+; NONEON-NOSVE-NEXT:    sxtb w10, w10
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    sxtb w11, w11
+; NONEON-NOSVE-NEXT:    lsr w9, w9, #5
+; NONEON-NOSVE-NEXT:    lsr w10, w10, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    lsr w8, w11, #5
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i8> %op1, shufflevector (<4 x i8> insertelement (<4 x i8> poison, i8 32, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer)
   ret <4 x i8> %res
 }
@@ -26,6 +63,62 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) {
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer)
   ret <8 x i8> %res
 }
@@ -38,6 +131,110 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) {
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer)
   ret <16 x i8> %res
 }
@@ -51,6 +248,208 @@ define void @sdiv_v32i8(ptr %a) {
 ; CHECK-NEXT:    asrd z1.b, p0/m, z1.b, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #61]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #59]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #57]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #23]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #55]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #21]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #53]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #19]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #51]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #49]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #13]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #11]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #9]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #7]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #5]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #3]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp, #1]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrsb w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #10, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxtb w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer)
   store <32 x i8> %res, ptr %a
@@ -66,6 +465,24 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) {
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    sxth w10, w8
+; NONEON-NOSVE-NEXT:    sxth w11, w9
+; NONEON-NOSVE-NEXT:    ubfx w10, w10, #26, #5
+; NONEON-NOSVE-NEXT:    ubfx w11, w11, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w10
+; NONEON-NOSVE-NEXT:    add w9, w9, w11
+; NONEON-NOSVE-NEXT:    sbfx w8, w8, #5, #11
+; NONEON-NOSVE-NEXT:    sbfx w9, w9, #5, #11
+; NONEON-NOSVE-NEXT:    stp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i16> %op1, shufflevector (<2 x i16> insertelement (<2 x i16> poison, i16 32, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer)
   ret <2 x i16> %res
 }
@@ -78,6 +495,38 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) {
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer)
   ret <4 x i16> %res
 }
@@ -90,6 +539,62 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) {
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer)
   ret <8 x i16> %res
 }
@@ -103,6 +608,112 @@ define void @sdiv_v16i16(ptr %a) {
 ; CHECK-NEXT:    asrd z1.h, p0/m, z1.h, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #58]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrsh w8, [sp]
+; NONEON-NOSVE-NEXT:    ubfx w9, w8, #26, #5
+; NONEON-NOSVE-NEXT:    add w8, w8, w9
+; NONEON-NOSVE-NEXT:    sxth w8, w8
+; NONEON-NOSVE-NEXT:    lsr w8, w8, #5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer)
   store <16 x i16> %res, ptr %a
@@ -117,6 +728,23 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) {
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer)
   ret <2 x i32> %res
 }
@@ -129,6 +757,32 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) {
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer)
   ret <4 x i32> %res
 }
@@ -142,6 +796,52 @@ define void @sdiv_v8i32(ptr %a) {
 ; CHECK-NEXT:    asrd z1.s, p0/m, z1.s, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w10, w8, #5
+; NONEON-NOSVE-NEXT:    ldr w8, [sp]
+; NONEON-NOSVE-NEXT:    asr w9, w8, #31
+; NONEON-NOSVE-NEXT:    add w8, w8, w9, lsr #27
+; NONEON-NOSVE-NEXT:    asr w8, w8, #5
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer)
   store <8 x i32> %res, ptr %a
@@ -156,6 +856,19 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) {
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    fmov x8, d0
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    str x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer)
   ret <1 x i64> %res
 }
@@ -169,6 +882,23 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) {
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x10, x8, #5
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer)
   ret <2 x i64> %res
 }
@@ -182,6 +912,34 @@ define void @sdiv_v4i64(ptr %a) {
 ; CHECK-NEXT:    asrd z1.d, p0/m, z1.d, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: sdiv_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x10, x8, #5
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x10, x8, #5
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    asr x9, x8, #63
+; NONEON-NOSVE-NEXT:    add x8, x8, x9, lsr #59
+; NONEON-NOSVE-NEXT:    asr x8, x8, #5
+; NONEON-NOSVE-NEXT:    stp x8, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer)
   store <4 x i64> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
index 38aaf860b7298..b66e6d9013573 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
@@ -18,9 +18,15 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) {
 ;
 ; NONEON-NOSVE-LABEL: hang_when_merging_stores_after_legalisation:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    // kill: def $d0 killed $d0 def $q0
-; NONEON-NOSVE-NEXT:    dup v0.4s, v0.s[0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer
   %interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -39,9 +45,25 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2
 ;
 ; NONEON-NOSVE-LABEL: interleave_store_without_splat:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
-; NONEON-NOSVE-NEXT:    stp q0, q2, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %interleaved = shufflevector <8 x i32> %shuffle, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -64,12 +86,40 @@ define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2)
 ;
 ; NONEON-NOSVE-LABEL: interleave_store_legalization:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    zip2 v4.4s, v1.4s, v3.4s
-; NONEON-NOSVE-NEXT:    zip1 v1.4s, v1.4s, v3.4s
-; NONEON-NOSVE-NEXT:    zip2 v3.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    zip1 v0.4s, v0.4s, v2.4s
-; NONEON-NOSVE-NEXT:    stp q1, q4, [x0, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [x0]
+; NONEON-NOSVE-NEXT:    sub sp, sp, #128
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
+; NONEON-NOSVE-NEXT:    stp q1, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q2, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #100]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #108]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr q3, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp q0, q2, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
   %interleaved.vec = shufflevector <8 x i32> %v1, <8 x i32> %v2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
                                                                              i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
index 649b13fa8a1e3..a4cf5d608fed6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 
@@ -15,6 +16,18 @@ define <4 x i8> @splat_v4i8(i8 %a) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <4 x i8> %insert, <4 x i8> undef, <4 x i32> zeroinitializer
   ret <4 x i8> %splat
@@ -26,6 +39,22 @@ define <8 x i8> @splat_v8i8(i8 %a) {
 ; CHECK-NEXT:    mov z0.b, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer
   ret <8 x i8> %splat
@@ -37,6 +66,29 @@ define <16 x i8> @splat_v16i8(i8 %a) {
 ; CHECK-NEXT:    mov z0.b, w0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
   ret <16 x i8> %splat
@@ -48,6 +100,31 @@ define void @splat_v32i8(i8 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.b, w0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #13]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #11]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #9]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #7]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #5]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #3]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w0, [sp, #1]
+; NONEON-NOSVE-NEXT:    strb w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <32 x i8> undef, i8 %a, i64 0
   %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
   store <32 x i8> %splat, ptr %b
@@ -60,6 +137,15 @@ define <2 x i16> @splat_v2i16(i16 %a) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <2 x i16> %insert, <2 x i16> undef, <2 x i32> zeroinitializer
   ret <2 x i16> %splat
@@ -71,6 +157,18 @@ define <4 x i16> @splat_v4i16(i16 %a) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer
   ret <4 x i16> %splat
@@ -82,6 +180,21 @@ define <8 x i16> @splat_v8i16(i16 %a) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
   ret <8 x i16> %splat
@@ -93,6 +206,23 @@ define void @splat_v16i16(i16 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.h, w0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #12]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #10]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #4]
+; NONEON-NOSVE-NEXT:    strh w0, [sp, #2]
+; NONEON-NOSVE-NEXT:    strh w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i16> undef, i16 %a, i64 0
   %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
   store <16 x i16> %splat, ptr %b
@@ -105,6 +235,15 @@ define <2 x i32> @splat_v2i32(i32 %a) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer
   ret <2 x i32> %splat
@@ -116,6 +255,15 @@ define <4 x i32> @splat_v4i32(i32 %a) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
   ret <4 x i32> %splat
@@ -127,6 +275,17 @@ define void @splat_v8i32(i32 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.s, w0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w0, w0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i32> undef, i32 %a, i64 0
   %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
   store <8 x i32> %splat, ptr %b
@@ -139,6 +298,15 @@ define <1 x i64> @splat_v1i64(i64 %a) {
 ; CHECK-NEXT:    mov z0.d, x0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str x0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer
   ret <1 x i64> %splat
@@ -150,6 +318,13 @@ define <2 x i64> @splat_v2i64(i64 %a) {
 ; CHECK-NEXT:    mov z0.d, x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp x0, x0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %splat
@@ -161,6 +336,15 @@ define void @splat_v4i64(i64 %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, x0
 ; CHECK-NEXT:    stp q0, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp x0, x0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i64> undef, i64 %a, i64 0
   %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
   store <4 x i64> %splat, ptr %b
@@ -178,6 +362,16 @@ define <2 x half> @splat_v2f16(half %a) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x half> undef, half %a, i64 0
   %splat = shufflevector <2 x half> %insert, <2 x half> undef, <2 x i32> zeroinitializer
   ret <2 x half> %splat
@@ -190,6 +384,18 @@ define <4 x half> @splat_v4f16(half %a) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x half> undef, half %a, i64 0
   %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer
   ret <4 x half> %splat
@@ -202,6 +408,21 @@ define <8 x half> @splat_v8f16(half %a) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x half> undef, half %a, i64 0
   %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer
   ret <8 x half> %splat
@@ -214,6 +435,23 @@ define void @splat_v16f16(half %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #6]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #4]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #2]
+; NONEON-NOSVE-NEXT:    str h0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x half> undef, half %a, i64 0
   %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer
   store <16 x half> %splat, ptr %b
@@ -227,6 +465,15 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x float> undef, float %a, i64 0
   %splat = shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer
   ret <2 x float> %splat
@@ -239,6 +486,15 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) {
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x float> undef, float %a, i64 0
   %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer
   ret <4 x float> %splat
@@ -251,6 +507,17 @@ define void @splat_v8f32(float %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s0, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x float> undef, float %a, i64 0
   %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer
   store <8 x float> %splat, ptr %b
@@ -261,6 +528,15 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) {
 ; CHECK-LABEL: splat_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <1 x double> undef, double %a, i64 0
   %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer
   ret <1 x double> %splat
@@ -273,6 +549,13 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) {
 ; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <2 x double> undef, double %a, i64 0
   %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer
   ret <2 x double> %splat
@@ -285,6 +568,15 @@ define void @splat_v4f64(double %a, ptr %b) {
 ; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp d0, d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x double> undef, double %a, i64 0
   %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer
   store <4 x double> %splat, ptr %b
@@ -301,6 +593,13 @@ define void @splat_imm_v32i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #1 // =0x1
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI24_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI24_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <32 x i8> undef, i8 1, i64 0
   %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
   store <32 x i8> %splat, ptr %a
@@ -313,6 +612,13 @@ define void @splat_imm_v16i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #2 // =0x2
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI25_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI25_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x i16> undef, i16 2, i64 0
   %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
   store <16 x i16> %splat, ptr %a
@@ -325,6 +631,13 @@ define void @splat_imm_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, #3 // =0x3
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI26_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x i32> undef, i32 3, i64 0
   %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
   store <8 x i32> %splat, ptr %a
@@ -337,6 +650,13 @@ define void @splat_imm_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #4 // =0x4
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI27_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI27_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x i64> undef, i64 4, i64 0
   %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
   store <4 x i64> %splat, ptr %a
@@ -353,6 +673,13 @@ define void @splat_imm_v16f16(ptr %a) {
 ; CHECK-NEXT:    fmov z0.h, #5.00000000
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI28_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI28_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <16 x half> undef, half 5.0, i64 0
   %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer
   store <16 x half> %splat, ptr %a
@@ -365,6 +692,13 @@ define void @splat_imm_v8f32(ptr %a) {
 ; CHECK-NEXT:    fmov z0.s, #6.00000000
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI29_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI29_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <8 x float> undef, float 6.0, i64 0
   %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer
   store <8 x float> %splat, ptr %a
@@ -377,6 +711,13 @@ define void @splat_imm_v4f64(ptr %a) {
 ; CHECK-NEXT:    fmov z0.d, #7.00000000
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: splat_imm_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI30_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI30_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %insert = insertelement <4 x double> undef, double 7.0, i64 0
   %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer
   store <4 x double> %splat, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
index c7435bdbec949..a77ac7832e17c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -12,6 +13,11 @@ define void @store_v4i8(ptr %a) {
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -22,6 +28,13 @@ define void @store_v8i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI1_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI1_0]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -32,6 +45,13 @@ define void @store_v16i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <16 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -42,6 +62,13 @@ define void @store_v32i8(ptr %a) {
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <32 x i8> zeroinitializer, ptr %a
   ret void
 }
@@ -53,6 +80,11 @@ define void @store_v2i16(ptr %a) {
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str wzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -64,6 +96,18 @@ define void @store_v2f16(ptr %a) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI5_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI5_0]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str w8, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   store <2 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -74,6 +118,13 @@ define void @store_v4i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI6_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI6_0]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -84,6 +135,13 @@ define void @store_v4f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI7_0
+; NONEON-NOSVE-NEXT:    ldr d0, [x8, :lo12:.LCPI7_0]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -94,6 +152,13 @@ define void @store_v8i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI8_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -104,6 +169,13 @@ define void @store_v8f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI9_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI9_0]
+; NONEON-NOSVE-NEXT:    str q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -114,6 +186,13 @@ define void @store_v16i16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI10_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI10_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <16 x i16> zeroinitializer, ptr %a
   ret void
 }
@@ -124,6 +203,13 @@ define void @store_v16f16(ptr %a) {
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI11_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI11_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <16 x half> zeroinitializer, ptr %a
   ret void
 }
@@ -133,6 +219,11 @@ define void @store_v2i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x i32> zeroinitializer, ptr %a
   ret void
 }
@@ -142,6 +233,11 @@ define void @store_v2f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x float> zeroinitializer, ptr %a
   ret void
 }
@@ -151,6 +247,11 @@ define void @store_v4i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i32> zeroinitializer, ptr %a
   ret void
 }
@@ -160,6 +261,11 @@ define void @store_v4f32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x float> zeroinitializer, ptr %a
   ret void
 }
@@ -170,6 +276,13 @@ define void @store_v8i32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI16_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI16_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x i32> zeroinitializer, ptr %a
   ret void
 }
@@ -180,6 +293,13 @@ define void @store_v8f32(ptr %a) {
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI17_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI17_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <8 x float> zeroinitializer, ptr %a
   ret void
 }
@@ -190,6 +310,16 @@ define void @store_v1i64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v1i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   store <1 x i64> zeroinitializer, ptr %a
   ret void
 }
@@ -200,6 +330,16 @@ define void @store_v1f64(ptr %a) {
 ; CHECK-NEXT:    fmov d0, xzr
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v1f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #16
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    str xzr, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    str d0, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   store <1 x double> zeroinitializer, ptr %a
   ret void
 }
@@ -209,6 +349,11 @@ define void @store_v2i64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x i64> zeroinitializer, ptr %a
   ret void
 }
@@ -218,6 +363,11 @@ define void @store_v2f64(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp xzr, xzr, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp xzr, xzr, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <2 x double> zeroinitializer, ptr %a
   ret void
 }
@@ -228,6 +378,13 @@ define void @store_v4i64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI22_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI22_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x i64> zeroinitializer, ptr %a
   ret void
 }
@@ -238,6 +395,13 @@ define void @store_v4f64(ptr %a) {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    stp q0, q0, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI23_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI23_0]
+; NONEON-NOSVE-NEXT:    stp q0, q0, [x0]
+; NONEON-NOSVE-NEXT:    ret
   store <4 x double> zeroinitializer, ptr %a
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
index 9e04fc236836c..a9f4d92b1e6b6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 ; Test we can code generater patterns of the form:
@@ -23,6 +24,16 @@ define void @subvector_v4i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4i8:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldrh w8, [x0, #2]
+; NONEON-NOSVE-NEXT:    ldrb w9, [x0, #1]
+; NONEON-NOSVE-NEXT:    ldrb w10, [x0]
+; NONEON-NOSVE-NEXT:    strh w8, [x1, #2]
+; NONEON-NOSVE-NEXT:    strb w9, [x1, #1]
+; NONEON-NOSVE-NEXT:    strb w10, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i8>, ptr %in
   br label %bb1
 
@@ -37,6 +48,12 @@ define void @subvector_v8i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v8i8:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i8>, ptr %in
   br label %bb1
 
@@ -51,6 +68,12 @@ define void @subvector_v16i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v16i8:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i8>, ptr %in
   br label %bb1
 
@@ -65,6 +88,12 @@ define void @subvector_v32i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v32i8:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   br label %bb1
 
@@ -81,6 +110,12 @@ define void @subvector_v2i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2i16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i16>, ptr %in
   br label %bb1
 
@@ -95,6 +130,12 @@ define void @subvector_v4i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4i16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i16>, ptr %in
   br label %bb1
 
@@ -109,6 +150,12 @@ define void @subvector_v8i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v8i16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %in
   br label %bb1
 
@@ -123,6 +170,12 @@ define void @subvector_v16i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v16i16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   br label %bb1
 
@@ -138,6 +191,12 @@ define void @subvector_v2i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2i32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i32>, ptr %in
   br label %bb1
 
@@ -152,6 +211,12 @@ define void @subvector_v4i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4i32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %in
   br label %bb1
 
@@ -166,6 +231,12 @@ define void @subvector_v8i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v8i32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   br label %bb1
 
@@ -181,6 +252,12 @@ define void @subvector_v2i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2i64:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %in
   br label %bb1
 
@@ -195,6 +272,12 @@ define void @subvector_v4i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4i64:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   br label %bb1
 
@@ -210,6 +293,12 @@ define void @subvector_v2f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2f16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr w8, [x0]
+; NONEON-NOSVE-NEXT:    str w8, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x half>, ptr %in
   br label %bb1
 
@@ -224,6 +313,12 @@ define void @subvector_v4f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4f16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x half>, ptr %in
   br label %bb1
 
@@ -238,6 +333,12 @@ define void @subvector_v8f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v8f16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x half>, ptr %in
   br label %bb1
 
@@ -252,6 +353,12 @@ define void @subvector_v16f16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v16f16:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x half>, ptr %in
   br label %bb1
 
@@ -267,6 +374,12 @@ define void @subvector_v2f32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2f32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr d0, [x0]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x float>, ptr %in
   br label %bb1
 
@@ -281,6 +394,12 @@ define void @subvector_v4f32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4f32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x float>, ptr %in
   br label %bb1
 
@@ -295,6 +414,12 @@ define void @subvector_v8f32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v8f32:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x float>,ptr %in
   br label %bb1
 
@@ -310,6 +435,12 @@ define void @subvector_v2f64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v2f64:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x double>, ptr %in
   br label %bb1
 
@@ -324,6 +455,12 @@ define void @subvector_v4f64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: subvector_v4f64:
+; NONEON-NOSVE:       // %bb.0: // %bb1
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1]
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x double>, ptr %in
   br label %bb1
 
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index b34fe438a063a..30682751037fe 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -12,6 +13,32 @@ define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_trunc_v8i16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #27]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
   %val = trunc <8 x i16> %a to <8 x i8>
   store <8 x i8> %val, ptr %dest
@@ -25,6 +52,20 @@ define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_trunc_v4i32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [x1, #3]
+; NONEON-NOSVE-NEXT:    strb w9, [x1, #2]
+; NONEON-NOSVE-NEXT:    strb w11, [x1, #1]
+; NONEON-NOSVE-NEXT:    strb w10, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = trunc <4 x i32> %a to <4 x i8>
   store <4 x i8> %val, ptr %dest
@@ -38,6 +79,22 @@ define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_trunc_v4i32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #30]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #26]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = trunc <4 x i32> %a to <4 x i16>
   store <4 x i16> %val, ptr %dest
@@ -51,6 +108,18 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_trunc_v2i64i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    str d0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = trunc <2 x i64> %a to <2 x i32>
   store <2 x i32> %val, ptr %dest
@@ -66,6 +135,19 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) {
 ; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: store_trunc_v2i256i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr x8, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldr x9, [x0]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %a = load <2 x i256>, ptr %ap
   %val = trunc <2 x i256> %a to <2 x i64>
   store <2 x i64> %val, ptr %dest
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
index 9e56462df3889..bc046059f0bd5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -19,6 +20,46 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i16_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #43]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #39]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #38]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #37]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #35]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #4]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = trunc <16 x i16> %a to <16 x i8>
   ret <16 x i8> %b
@@ -41,6 +82,129 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.b, z2.b, z2.b
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i16_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #208
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #112] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #128] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #144] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #160] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w29, [sp, #52]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #50]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #54]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #26]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w21, [sp, #20]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #92]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #44]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #91]
+; NONEON-NOSVE-NEXT:    add w9, w28, w28
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #46]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #89]
+; NONEON-NOSVE-NEXT:    add w9, w26, w26
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #42]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #36]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #18]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #87]
+; NONEON-NOSVE-NEXT:    add w9, w24, w24
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #38]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #60]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #85]
+; NONEON-NOSVE-NEXT:    add w9, w22, w22
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #62]
+; NONEON-NOSVE-NEXT:    add w6, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #84]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    add w5, w13, w13
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #83]
+; NONEON-NOSVE-NEXT:    add w9, w20, w20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #81]
+; NONEON-NOSVE-NEXT:    add w9, w7, w7
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #34]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #80]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #76]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #111]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #78]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #110]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #109]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #74]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #108]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #68]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #107]
+; NONEON-NOSVE-NEXT:    add w9, w17, w17
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #106]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldrh w30, [sp, #58]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #105]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #104]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #103]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #102]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #101]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #100]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #99]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #98]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #95]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #94]
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #93]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #97]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #160] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #144] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #128] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #112] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #208
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i16>, ptr %in
   %b = trunc <32 x i16> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -76,6 +240,280 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v64i16_v64i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #448
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #152] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #238]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #230]
+; NONEON-NOSVE-NEXT:    add w21, w8, w8
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #228]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #226]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #270]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #268]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #266]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #262]
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #258]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #254]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #210]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #252]
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #250]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #212]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #214]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #246]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #244]
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #242]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #218]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #174]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #220]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #222]
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #172]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #170]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #178]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #168]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #166]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #182]
+; NONEON-NOSVE-NEXT:    ldrh w29, [sp, #164]
+; NONEON-NOSVE-NEXT:    ldrh w30, [sp, #162]
+; NONEON-NOSVE-NEXT:    strb w21, [sp, #335]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #186]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #188]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #190]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #196]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #204]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #236]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #234]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #334]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #333]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #331]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #330]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #329]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #327]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #326]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #325]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #323]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #322]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #321]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #320]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #319]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #318]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #317]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #315]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #314]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #313]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #312]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #311]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #310]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #309]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #307]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #306]
+; NONEON-NOSVE-NEXT:    add w8, w30, w30
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #305]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #303]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #302]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #301]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #299]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #297]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #295]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #294]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #293]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #291]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #290]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #289]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #288]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #351]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #350]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #349]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #347]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #346]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #345]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #343]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #342]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #341]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #339]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #338]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #337]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #152] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x8]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x8, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #448
+; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i16>, ptr %in
   %b = trunc <64 x i16> %a to <64 x i8>
   %c = add <64 x i8> %b, %b
@@ -133,6 +571,602 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q2, q3, [x1, #32]
 ; CHECK-NEXT:    stp q4, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v128i16_v128i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #800
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #408] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #128]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #224]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #592]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #606]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldrh w10, [sp, #600]
+; NONEON-NOSVE-NEXT:    stp q18, q20, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldrh w11, [sp, #598]
+; NONEON-NOSVE-NEXT:    ldrh w12, [sp, #596]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp q17, q23, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w13, [sp, #594]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldrh w14, [sp, #592]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #464]
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str w8, [sp, #404] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #434]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #560]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #400] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    str q5, [sp, #544]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #396] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #438]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #608]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #392] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    ldrh w15, [sp, #638]
+; NONEON-NOSVE-NEXT:    stp q7, q21, [sp, #640]
+; NONEON-NOSVE-NEXT:    ldrh w16, [sp, #636]
+; NONEON-NOSVE-NEXT:    ldrh w17, [sp, #634]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #388] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #442]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #666]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldrh w18, [sp, #632]
+; NONEON-NOSVE-NEXT:    ldrh w0, [sp, #630]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #384] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    ldrh w1, [sp, #628]
+; NONEON-NOSVE-NEXT:    ldrh w2, [sp, #626]
+; NONEON-NOSVE-NEXT:    ldrh w3, [sp, #624]
+; NONEON-NOSVE-NEXT:    ldrh w4, [sp, #622]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #380] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #446]
+; NONEON-NOSVE-NEXT:    ldrh w5, [sp, #620]
+; NONEON-NOSVE-NEXT:    ldrh w6, [sp, #618]
+; NONEON-NOSVE-NEXT:    ldrh w7, [sp, #616]
+; NONEON-NOSVE-NEXT:    ldrh w19, [sp, #614]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #376] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    ldrh w20, [sp, #612]
+; NONEON-NOSVE-NEXT:    ldrh w21, [sp, #610]
+; NONEON-NOSVE-NEXT:    ldrh w22, [sp, #608]
+; NONEON-NOSVE-NEXT:    ldrh w23, [sp, #430]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #372] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #482]
+; NONEON-NOSVE-NEXT:    ldrh w24, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldrh w25, [sp, #426]
+; NONEON-NOSVE-NEXT:    ldrh w26, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldrh w27, [sp, #422]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #368] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #484]
+; NONEON-NOSVE-NEXT:    ldrh w28, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldrh w29, [sp, #418]
+; NONEON-NOSVE-NEXT:    strb w30, [sp, #767]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #364] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #486]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #360] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #488]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #356] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #490]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #352] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #492]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #494]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #344] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #450]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #336] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #454]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #328] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #458]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #320] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #462]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #312] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #304] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #470]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #296] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #474]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #288] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #478]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #280] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #656]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #658]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #272] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #660]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #268] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #662]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #264] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #664]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #260] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #668]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #252] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #670]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #528]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #244] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #530]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #532]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #236] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #534]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #536]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #228] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #538]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #540]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #220] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #542]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #212] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #498]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #500]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #204] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #502]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #504]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #196] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #506]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #508]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #188] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #510]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #180] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #514]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #516]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #172] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #518]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #164] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #522]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #524]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #156] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #526]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #640]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #148] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #642]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #644]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #140] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #646]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #648]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #132] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #650]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #652]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #124] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #654]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #576]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #116] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #578]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #580]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #108] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #582]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #584]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #100] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #586]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #588]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #92] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #590]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #544]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #84] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #546]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #548]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #76] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #550]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #552]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #68] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #554]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #556]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #558]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #560]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #562]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #564]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #566]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #568]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #570]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #572]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #574]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #602]
+; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #604]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #765]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #764]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #763]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #762]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #761]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #760]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #759]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #758]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #757]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #756]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #755]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #754]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #753]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #752]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #751]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #750]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #749]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #748]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #747]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #746]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #745]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #744]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #743]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #742]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #741]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #740]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #739]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #738]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #737]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #766]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #736]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #736]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #735]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #734]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #733]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #732]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #731]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #730]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #729]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #728]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #727]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #726]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #725]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #724]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #723]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #722]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #721]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #720]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #783]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #782]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #781]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #780]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #779]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #778]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #777]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #776]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #775]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #774]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #773]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #772]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #771]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #770]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #769]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #768]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #152] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #719]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #156] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #718]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #717]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #164] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #716]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #168] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #715]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #172] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #714]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #713]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #180] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #712]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #711]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #188] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #710]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #709]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #196] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #708]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #200] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #707]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #204] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #706]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #208] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #705]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #212] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #704]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q3, [sp, #704]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #799]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #220] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #798]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #224] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #797]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #228] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #796]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #232] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #795]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #236] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #794]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #240] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #793]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #244] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #792]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #248] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #791]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #252] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #790]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #256] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #789]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #260] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #788]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #264] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #787]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #786]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #272] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #785]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #276] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #784]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #280] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q4, q7, [sp, #768]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #687]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #284] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #686]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #288] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #685]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #292] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #684]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #296] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #683]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #300] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #682]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #304] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #681]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #308] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #680]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #312] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #679]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #316] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #678]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #320] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #677]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #324] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #676]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #328] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #675]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #674]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #336] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #673]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #340] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #672]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #344] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #703]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #348] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #702]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #352] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #701]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #356] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #700]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #360] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #699]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #364] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #698]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #368] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #697]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #372] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #696]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #376] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #695]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #380] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #694]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #384] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #693]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #388] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #692]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #392] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #691]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #396] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #690]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #400] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #689]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #404] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #688]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #408] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #672]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x8]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x8, #32]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x8, #64]
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x8, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #800
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %a = load <128 x i16>, ptr %in
   %b = trunc <128 x i16> %a to <128 x i8>
   %c = add <128 x i8> %b, %b
@@ -155,6 +1189,26 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v8i32_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #45]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #43]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #41]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = trunc <8 x i32> %a to <8 x i8>
   ret <8 x i8> %b
@@ -178,6 +1232,42 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i32_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #77]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #75]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #73]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #71]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #69]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #67]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #65]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i32>, ptr %in
   %b = trunc <16 x i32> %a to <16 x i8>
   ret <16 x i8> %b
@@ -215,6 +1305,117 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.b, z3.b, z3.b
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i32_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #272
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #192] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp w27, w28, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w25, w26, [sp, #104]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w10, w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp w23, w24, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w21, w22, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w19, w20, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #155]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #154]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #153]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    ldp w4, w7, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #151]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #150]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldp w2, w3, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #149]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #148]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldp w18, w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #147]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #146]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldp w16, w17, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #145]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #175]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #174]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #173]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #171]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #170]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #169]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #167]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #166]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    ldp w29, w30, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #165]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #164]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #163]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #162]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #159]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #158]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #157]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #156]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #161]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #192] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #176] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #272
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -279,6 +1480,277 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v64i32_v64i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #480
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #152] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #128]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #160]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    str q18, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #304]
+; NONEON-NOSVE-NEXT:    stp q21, q19, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #292]
+; NONEON-NOSVE-NEXT:    add w20, w8, w8
+; NONEON-NOSVE-NEXT:    stp q20, q23, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #312]
+; NONEON-NOSVE-NEXT:    stp q3, q17, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #404]
+; NONEON-NOSVE-NEXT:    str q7, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #396]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #392]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #408]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #412]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #332]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #328]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #324]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w1, [sp, #388]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #384]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #344]
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #340]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w6, [sp, #336]
+; NONEON-NOSVE-NEXT:    stp q6, q5, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #380]
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #376]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #372]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #364]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #352]
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #463]
+; NONEON-NOSVE-NEXT:    add w20, w22, w22
+; NONEON-NOSVE-NEXT:    strb w20, [sp, #462]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w29, w28, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldp w8, w30, [sp, #160]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #461]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    add w8, w9, w9
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #459]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #458]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #457]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #455]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #454]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #453]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #451]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #450]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #449]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #447]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #446]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #445]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #443]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #442]
+; NONEON-NOSVE-NEXT:    add w8, w21, w21
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #441]
+; NONEON-NOSVE-NEXT:    add w8, w23, w23
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #439]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #438]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #437]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #435]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #434]
+; NONEON-NOSVE-NEXT:    add w8, w30, w30
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #433]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #431]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #430]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #429]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #427]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #426]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #425]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #423]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #422]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #421]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #419]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #417]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #416]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #479]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #478]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #477]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #475]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #474]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #473]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #471]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #470]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #469]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #467]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #144] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #465]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #148] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #152] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #448]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x8]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x8, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #480
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i8>
   %c = add <64 x i8> %b, %b
@@ -300,6 +1772,26 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v8i32_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #38]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #34]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = trunc <8 x i32> %a to <8 x i16>
   ret <8 x i16> %b
@@ -322,6 +1814,58 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.h, z2.h, z2.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i32_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp w2, w3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp w4, w5, [sp, #8]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldp w18, w0, [sp]
+; NONEON-NOSVE-NEXT:    ldp w16, w17, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #76]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    add w8, w5, w5
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #70]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #68]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #94]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #92]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #90]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #86]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #84]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #82]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i32>, ptr %in
   %b = trunc <16 x i32> %a to <16 x i16>
   %c = add <16 x i16> %b, %b
@@ -357,6 +1901,119 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i32_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #304
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #224] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #240] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #256] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #272] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldp w27, w28, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp w25, w26, [sp, #104]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #288] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w10, w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp w23, w24, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp w21, w22, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp w19, w20, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #208] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #182]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #180]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #178]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    ldp w4, w7, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #174]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #172]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldp w2, w3, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #170]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldp w18, w0, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #166]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #164]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldp w16, w17, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #162]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldp w14, w15, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #158]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #156]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldp w12, w13, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #154]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    ldp w10, w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #150]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #148]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #146]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #206]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #204]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    ldp w29, w30, [sp, #80]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #202]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #198]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #196]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #190]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #188]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #186]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #184]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #288] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #194]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #272] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #256] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #240] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #224] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #208] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #304
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i16>
   %c = add <32 x i16> %b, %b
@@ -414,6 +2071,280 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q2, q3, [x1, #32]
 ; CHECK-NEXT:    stp q4, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v64i32_v64i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    sub sp, sp, #528
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    mov x5, x1
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #128]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    stp q17, q23, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #320]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w21, w8, w8
+; NONEON-NOSVE-NEXT:    stp q18, q20, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp q7, q21, [sp, #368]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #380]
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #376]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #288]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #336]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #296]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #360]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #348]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #344]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #340]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #336]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #120] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w6, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #288]
+; NONEON-NOSVE-NEXT:    str q5, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #316]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #284]
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #280]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #276]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #304]
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #494]
+; NONEON-NOSVE-NEXT:    add w21, w23, w23
+; NONEON-NOSVE-NEXT:    strh w21, [sp, #492]
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldp w0, w18, [sp, #152]
+; NONEON-NOSVE-NEXT:    ldp w2, w1, [sp, #144]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #104] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #96] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #384]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #388]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #392]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #396]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #260]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #56] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #268]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #48] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #224]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #232]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #240]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp, #248]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #368]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #372]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #364]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #490]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #488]
+; NONEON-NOSVE-NEXT:    add w8, w9, w9
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #486]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #484]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #482]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #480]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #478]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #476]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #474]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #472]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #470]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #468]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #466]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #464]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #462]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #460]
+; NONEON-NOSVE-NEXT:    add w8, w6, w6
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #458]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #456]
+; NONEON-NOSVE-NEXT:    add w8, w19, w19
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #454]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #452]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #450]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #448]
+; NONEON-NOSVE-NEXT:    add w8, w25, w25
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #510]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #508]
+; NONEON-NOSVE-NEXT:    add w8, w27, w27
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #506]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #504]
+; NONEON-NOSVE-NEXT:    add w8, w29, w29
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #502]
+; NONEON-NOSVE-NEXT:    add w8, w30, w30
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #500]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #464]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #498]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #496]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #446]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #444]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #442]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #28] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #440]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #438]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #36] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #436]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #434]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #432]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q6, q3, [sp, #432]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #526]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #52] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #524]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #522]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #60] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #520]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #518]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #68] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #516]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #72] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #514]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #76] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #512]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #80] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q4, q7, [sp, #496]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #414]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #84] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #412]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #88] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #410]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #92] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #408]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #406]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #100] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #404]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #104] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #402]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #108] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #400]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #112] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #430]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #116] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #428]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #120] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #426]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #124] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #424]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #128] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #422]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #132] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #420]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #136] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #418]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #140] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #416]
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #400]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x5]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x5, #32]
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x5, #64]
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x5, #96]
+; NONEON-NOSVE-NEXT:    add sp, sp, #528
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i16>
   %c = add <64 x i16> %b, %b
@@ -437,6 +2368,20 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v4i64_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i8>
   ret <4 x i8> %b
@@ -461,6 +2406,31 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v8i64_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #79]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #77]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #75]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #72]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i8>
   ret <8 x i8> %b
@@ -499,6 +2469,51 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i64_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #144
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #96]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    ldp q4, q5, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q6, q7, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #80]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q7, q5, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #142]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #143]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #48]
+; NONEON-NOSVE-NEXT:    strb w10, [sp, #141]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #138]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #120]
+; NONEON-NOSVE-NEXT:    strb w11, [sp, #139]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #137]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #112]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #72]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #135]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #134]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #104]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #133]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #96]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #131]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #130]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #129]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    add sp, sp, #144
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i8>
   ret <16 x i8> %b
@@ -565,6 +2580,143 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i64_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #416
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #128]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #320] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #224]
+; NONEON-NOSVE-NEXT:    str x1, [sp, #24] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #160]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp q21, q19, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #216]
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp q20, q23, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q6, q5, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q3, q17, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #272]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #264]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #298]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #248]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #232]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    str w8, [sp, #20] // 4-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #200]
+; NONEON-NOSVE-NEXT:    str q7, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #104]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #112]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    str q18, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #40]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #299]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #80]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #294]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #88]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #297]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #56]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #292]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #144]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #295]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #136]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #290]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #120]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #293]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #152]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #288]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w1, [sp, #280]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #291]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #160]
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #318]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #168]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #289]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #319]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #314]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #317]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #315]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #310]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #313]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #311]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #306]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #20] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #309]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #303]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #302]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #307]
+; NONEON-NOSVE-NEXT:    add w8, w1, w1
+; NONEON-NOSVE-NEXT:    strb w5, [sp, #301]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w6, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #305]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #24] // 8-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strb w9, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x8]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #320] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #416
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i8>
   %c = add <32 x i8> %b, %b
@@ -587,6 +2739,20 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v4i64_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #42]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i16>
   ret <4 x i16> %b
@@ -610,6 +2776,31 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v8i64_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q2, q3, [x0]
+; NONEON-NOSVE-NEXT:    str q1, [sp, #48]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #78]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldp x8, x11, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w10, [sp, #74]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w11, [sp, #70]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i16>
   ret <8 x i16> %b
@@ -647,6 +2838,70 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.h, z3.h, z3.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i64_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #160
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #142]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #140]
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #40]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #138]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #136]
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #24]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #134]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #132]
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #130]
+; NONEON-NOSVE-NEXT:    add w9, w17, w17
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #128]
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #120]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #158]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #156]
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #154]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #150]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #148]
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #146]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #128]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #160
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i16>
   %c = add <16 x i16> %b, %b
@@ -711,6 +2966,144 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q1, q2, [x1, #32]
 ; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i64_v32i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #432
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #352] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #64]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #368] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #384] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #128]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #336] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #224]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #192]
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #160]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp q21, q19, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #200]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    stp q20, q23, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp q6, q5, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #88]
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q3, q17, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #224]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #232]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #256]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #264]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #248]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #308]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #208]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #184]
+; NONEON-NOSVE-NEXT:    str q7, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #40]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #304]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #310]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #306]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #24]
+; NONEON-NOSVE-NEXT:    str q18, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #120]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #302]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #64]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #300]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #72]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #298]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #96]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #104]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #294]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #128]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #292]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #136]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #290]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #144]
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #288]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #152]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #286]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #284]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #282]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #278]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #276]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #274]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #334]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #332]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #330]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #326]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #324]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #318]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    strh w5, [sp, #314]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #272]
+; NONEON-NOSVE-NEXT:    strh w6, [sp, #312]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #322]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    strh w9, [sp, #320]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #384] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #304]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #368] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #352] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #336] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #432
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i16>
   %c = add <32 x i16> %b, %b
@@ -732,6 +3125,18 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind {
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v4i64_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldp x8, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %a = load <4 x i64>, ptr %in
   %b = trunc <4 x i64> %a to <4 x i32>
   ret <4 x i32> %b
@@ -754,6 +3159,38 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    add z1.s, z2.s, z2.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v8i64_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #96
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp]
+; NONEON-NOSVE-NEXT:    stp q2, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #40]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #56]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #88]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    add sp, sp, #96
+; NONEON-NOSVE-NEXT:    ret
   %a = load <8 x i64>, ptr %in
   %b = trunc <8 x i64> %a to <8 x i32>
   %c = add <8 x i32> %b, %b
@@ -789,6 +3226,64 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
 ; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v16i64_v16i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #192
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0, #32]
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp q2, q4, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q3, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #72]
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #104]
+; NONEON-NOSVE-NEXT:    stp q5, q7, [sp]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldr w5, [sp, #88]
+; NONEON-NOSVE-NEXT:    stp q6, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #168]
+; NONEON-NOSVE-NEXT:    add w9, w3, w3
+; NONEON-NOSVE-NEXT:    add w8, w2, w2
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #160]
+; NONEON-NOSVE-NEXT:    add w9, w5, w5
+; NONEON-NOSVE-NEXT:    add w8, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #152]
+; NONEON-NOSVE-NEXT:    add w9, w0, w0
+; NONEON-NOSVE-NEXT:    add w8, w18, w18
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #144]
+; NONEON-NOSVE-NEXT:    add w9, w17, w17
+; NONEON-NOSVE-NEXT:    add w8, w16, w16
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #136]
+; NONEON-NOSVE-NEXT:    add w9, w15, w15
+; NONEON-NOSVE-NEXT:    add w8, w14, w14
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #128]
+; NONEON-NOSVE-NEXT:    add w9, w13, w13
+; NONEON-NOSVE-NEXT:    add w8, w12, w12
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #184]
+; NONEON-NOSVE-NEXT:    add w9, w11, w11
+; NONEON-NOSVE-NEXT:    add w8, w10, w10
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldp q1, q3, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldp q2, q0, [sp, #160]
+; NONEON-NOSVE-NEXT:    stp q3, q2, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x1, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #192
+; NONEON-NOSVE-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i32>
   %c = add <16 x i32> %b, %b
@@ -846,6 +3341,149 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT:    stp q2, q3, [x1, #32]
 ; CHECK-NEXT:    stp q4, q0, [x1]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: trunc_v32i64_v32i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #496
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0, #32]
+; NONEON-NOSVE-NEXT:    stp x28, x27, [sp, #416] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q17, q16, [x0, #192]
+; NONEON-NOSVE-NEXT:    stp x26, x25, [sp, #432] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    stp x24, x23, [sp, #448] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q23, q22, [x0, #224]
+; NONEON-NOSVE-NEXT:    stp x22, x21, [sp, #464] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q3, q2, [x0]
+; NONEON-NOSVE-NEXT:    stp x20, x19, [sp, #480] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q5, q4, [x0, #96]
+; NONEON-NOSVE-NEXT:    stp x29, x30, [sp, #400] // 16-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldp q7, q6, [x0, #64]
+; NONEON-NOSVE-NEXT:    ldp q19, q18, [x0, #160]
+; NONEON-NOSVE-NEXT:    ldp q21, q20, [x0, #128]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #192]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #192]
+; NONEON-NOSVE-NEXT:    stp q17, q23, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #200]
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q4, q6, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w12, [sp, #48]
+; NONEON-NOSVE-NEXT:    add w6, w8, w8
+; NONEON-NOSVE-NEXT:    add w5, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp q18, q20, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldr w25, [sp, #160]
+; NONEON-NOSVE-NEXT:    ldr w26, [sp, #168]
+; NONEON-NOSVE-NEXT:    str q5, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w21, [sp, #176]
+; NONEON-NOSVE-NEXT:    ldr w22, [sp, #184]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w23, [sp, #144]
+; NONEON-NOSVE-NEXT:    ldr w24, [sp, #152]
+; NONEON-NOSVE-NEXT:    str q3, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #208]
+; NONEON-NOSVE-NEXT:    ldr w4, [sp, #112]
+; NONEON-NOSVE-NEXT:    stp w8, w10, [sp, #8] // 8-byte Folded Spill
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #216]
+; NONEON-NOSVE-NEXT:    ldr w27, [sp, #16]
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    ldr w28, [sp, #24]
+; NONEON-NOSVE-NEXT:    stp q22, q16, [sp, #64]
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w9, [sp, #344]
+; NONEON-NOSVE-NEXT:    add w9, w27, w27
+; NONEON-NOSVE-NEXT:    str w8, [sp, #348]
+; NONEON-NOSVE-NEXT:    add w8, w28, w28
+; NONEON-NOSVE-NEXT:    ldr w7, [sp, #120]
+; NONEON-NOSVE-NEXT:    stp q7, q21, [sp, #240]
+; NONEON-NOSVE-NEXT:    ldr w18, [sp, #128]
+; NONEON-NOSVE-NEXT:    ldr w0, [sp, #136]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #340]
+; NONEON-NOSVE-NEXT:    add w8, w26, w26
+; NONEON-NOSVE-NEXT:    ldr w19, [sp, #240]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #336]
+; NONEON-NOSVE-NEXT:    add w9, w25, w25
+; NONEON-NOSVE-NEXT:    ldr w20, [sp, #248]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #332]
+; NONEON-NOSVE-NEXT:    add w8, w24, w24
+; NONEON-NOSVE-NEXT:    ldr w16, [sp, #256]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #328]
+; NONEON-NOSVE-NEXT:    add w9, w23, w23
+; NONEON-NOSVE-NEXT:    ldr w17, [sp, #264]
+; NONEON-NOSVE-NEXT:    str q19, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldr w14, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr w15, [sp, #72]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #324]
+; NONEON-NOSVE-NEXT:    add w8, w22, w22
+; NONEON-NOSVE-NEXT:    ldr w2, [sp, #96]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #320]
+; NONEON-NOSVE-NEXT:    add w9, w21, w21
+; NONEON-NOSVE-NEXT:    ldr w3, [sp, #104]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #380]
+; NONEON-NOSVE-NEXT:    add w8, w20, w20
+; NONEON-NOSVE-NEXT:    ldr w13, [sp, #56]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #376]
+; NONEON-NOSVE-NEXT:    add w9, w19, w19
+; NONEON-NOSVE-NEXT:    ldr w10, [sp, #80]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #372]
+; NONEON-NOSVE-NEXT:    add w8, w7, w7
+; NONEON-NOSVE-NEXT:    ldr w11, [sp, #88]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #368]
+; NONEON-NOSVE-NEXT:    add w9, w4, w4
+; NONEON-NOSVE-NEXT:    ldr w29, [sp, #224]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #316]
+; NONEON-NOSVE-NEXT:    add w8, w3, w3
+; NONEON-NOSVE-NEXT:    ldr w30, [sp, #232]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #312]
+; NONEON-NOSVE-NEXT:    add w9, w2, w2
+; NONEON-NOSVE-NEXT:    str w8, [sp, #308]
+; NONEON-NOSVE-NEXT:    add w8, w0, w0
+; NONEON-NOSVE-NEXT:    str w9, [sp, #304]
+; NONEON-NOSVE-NEXT:    add w9, w18, w18
+; NONEON-NOSVE-NEXT:    str w8, [sp, #396]
+; NONEON-NOSVE-NEXT:    add w8, w17, w17
+; NONEON-NOSVE-NEXT:    str w9, [sp, #392]
+; NONEON-NOSVE-NEXT:    add w9, w16, w16
+; NONEON-NOSVE-NEXT:    str w8, [sp, #388]
+; NONEON-NOSVE-NEXT:    add w8, w15, w15
+; NONEON-NOSVE-NEXT:    str w9, [sp, #384]
+; NONEON-NOSVE-NEXT:    add w9, w14, w14
+; NONEON-NOSVE-NEXT:    str w8, [sp, #284]
+; NONEON-NOSVE-NEXT:    add w8, w13, w13
+; NONEON-NOSVE-NEXT:    str w9, [sp, #280]
+; NONEON-NOSVE-NEXT:    add w9, w12, w12
+; NONEON-NOSVE-NEXT:    str w8, [sp, #276]
+; NONEON-NOSVE-NEXT:    add w8, w11, w11
+; NONEON-NOSVE-NEXT:    str w9, [sp, #272]
+; NONEON-NOSVE-NEXT:    add w9, w10, w10
+; NONEON-NOSVE-NEXT:    str w8, [sp, #300]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    str w9, [sp, #296]
+; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12] // 4-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add w8, w8, w8
+; NONEON-NOSVE-NEXT:    str w5, [sp, #364]
+; NONEON-NOSVE-NEXT:    add w5, w30, w30
+; NONEON-NOSVE-NEXT:    add w9, w9, w9
+; NONEON-NOSVE-NEXT:    str w6, [sp, #360]
+; NONEON-NOSVE-NEXT:    add w6, w29, w29
+; NONEON-NOSVE-NEXT:    str w5, [sp, #356]
+; NONEON-NOSVE-NEXT:    ldp q6, q3, [sp, #304]
+; NONEON-NOSVE-NEXT:    str w6, [sp, #352]
+; NONEON-NOSVE-NEXT:    ldp q4, q7, [sp, #368]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #292]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp, #336]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #288]
+; NONEON-NOSVE-NEXT:    ldp x20, x19, [sp, #480] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp q5, q2, [sp, #272]
+; NONEON-NOSVE-NEXT:    stp q4, q3, [x1, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldp x22, x21, [sp, #464] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q7, q6, [x1, #64]
+; NONEON-NOSVE-NEXT:    ldp x24, x23, [sp, #448] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    stp q2, q5, [x1, #96]
+; NONEON-NOSVE-NEXT:    ldp x26, x25, [sp, #432] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x28, x27, [sp, #416] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    ldp x29, x30, [sp, #400] // 16-byte Folded Reload
+; NONEON-NOSVE-NEXT:    add sp, sp, #496
+; NONEON-NOSVE-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i32>
   %c = add <32 x i32> %b, %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index 304823c9e6414..323f5f56a2c08 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -14,6 +15,21 @@ define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    tbl z0.h, { z0.h }, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #6]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldur w8, [sp, #2]
+; NONEON-NOSVE-NEXT:    ror w8, w8, #16
+; NONEON-NOSVE-NEXT:    str w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
   ret <4 x i8> %ret
 }
@@ -28,6 +44,23 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    insr z1.b, w8
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #22]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x i8> %ret
 }
@@ -42,6 +75,24 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    insr z1.b, w8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #45]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #41]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #33]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #15]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <16 x i8> %op1, <16 x i8> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
                                                                    i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
   ret <16 x i8> %ret
@@ -60,6 +111,39 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.b, w8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #31]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #29]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #25]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #17]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #63]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #79]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    sturh w8, [sp, #77]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #73]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #65]
+; NONEON-NOSVE-NEXT:    ldrb w8, [sp, #47]
+; NONEON-NOSVE-NEXT:    strb w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
   %ret = shufflevector <32 x i8> %op1, <32 x i8> %op2, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,
@@ -78,6 +162,16 @@ define <2 x i16> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-NEXT:    revw z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldp w9, w8, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i16> %op1, <2 x i16> %op2, <2 x i32> <i32 1, i32 0>
   ret <2 x i16> %ret
 }
@@ -92,6 +186,21 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-NEXT:    insr z1.h, w8
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #20]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x i16> %ret
 }
@@ -106,6 +215,22 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    insr z1.h, w8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #14]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x i16> %ret
 }
@@ -123,6 +248,35 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.h, w8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #62]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #60]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldrh w8, [sp, #46]
+; NONEON-NOSVE-NEXT:    strh w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
   %ret = shufflevector <16 x i16> %op1, <16 x i16> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
@@ -141,6 +295,17 @@ define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-NEXT:    insr z1.s, w8
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i32> %op1, <2 x i32> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x i32> %ret
 }
@@ -155,6 +320,20 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-NEXT:    insr z1.s, w8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #36]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #12]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x i32> %op1, <4 x i32> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x i32> %ret
 }
@@ -172,6 +351,30 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.s, w8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #56]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str w8, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
   %ret = shufflevector <8 x i32> %op1, <8 x i32> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@@ -189,6 +392,16 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-NEXT:    insr z1.d, x8
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x i64> %op1, <2 x i64> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %ret
 }
@@ -206,6 +419,24 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.d, x8
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x9, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp x10, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x8, x11, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
   %ret = shufflevector <4 x i64> %op1, <4 x i64> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -223,6 +454,21 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-NEXT:    insr z0.h, h2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x half> %ret
 }
@@ -236,6 +482,22 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-NEXT:    insr z0.h, h2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #24]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #42]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #32]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #34]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x half> %ret
 }
@@ -251,6 +513,35 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.h, h2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16f16:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #12]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #62]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #26]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #60]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #18]
+; NONEON-NOSVE-NEXT:    ldr w8, [sp, #56]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #78]
+; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur w8, [sp, #74]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #66]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
   %ret = shufflevector <16 x half> %op1, <16 x half> %op2, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22,
@@ -268,6 +559,17 @@ define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2)
 ; CHECK-NEXT:    insr z0.s, s2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %ret
 }
@@ -281,6 +583,20 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2)
 ; CHECK-NEXT:    insr z0.s, s2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #36]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x float> %ret
 }
@@ -296,6 +612,30 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.s, s2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f32:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #28]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #56]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #20]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #48]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #76]
+; NONEON-NOSVE-NEXT:    ldr s0, [sp, #44]
+; NONEON-NOSVE-NEXT:    str s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stur x8, [sp, #68]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    str s0, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
   %ret = shufflevector <8 x float> %op1, <8 x float> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@@ -312,6 +652,16 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op
 ; CHECK-NEXT:    insr z0.d, d2
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-48]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 48
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    add sp, sp, #48
+; NONEON-NOSVE-NEXT:    ret
   %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x double> %ret
 }
@@ -327,6 +677,24 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.d, d2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f64:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x1]
+; NONEON-NOSVE-NEXT:    ldr q2, [x0, #16]
+; NONEON-NOSVE-NEXT:    str q0, [sp, #-80]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    stp q2, q1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp]
+; NONEON-NOSVE-NEXT:    ldp d3, d2, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d2, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp d0, d3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -345,6 +713,25 @@ define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) {
 ; CHECK-NEXT:    insr z3.d, d2
 ; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_byone_reverse:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #80
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT:    ldr q2, [x1, #16]
+; NONEON-NOSVE-NEXT:    str q2, [sp]
+; NONEON-NOSVE-NEXT:    stp q1, q0, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #40]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    add sp, sp, #80
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
@@ -359,6 +746,13 @@ define void @shuffle_ext_invalid(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: shuffle_ext_invalid:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
+; NONEON-NOSVE-NEXT:    ldr q1, [x1]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
   %ret = shufflevector <4 x double> %op1, <4 x double> %op2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
index 6c9c055605668..67cdde718e391 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -11,6 +12,11 @@ define fp128 @test_streaming_compatible_register_mov(fp128 %q0, fp128 %q1) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: test_streaming_compatible_register_mov:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    mov v0.16b, v1.16b
+; NONEON-NOSVE-NEXT:    ret
   ret fp128 %q1
 }
 
@@ -20,6 +26,11 @@ define double @fp_zero_constant() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, xzr
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fp_zero_constant:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    fmov d0, xzr
+; NONEON-NOSVE-NEXT:    ret
   ret double 0.0
 }
 
@@ -29,6 +40,12 @@ define <2 x i64> @fixed_vec_zero_constant() {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fixed_vec_zero_constant:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI2_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
+; NONEON-NOSVE-NEXT:    ret
   ret <2 x i64> zeroinitializer
 }
 
@@ -38,5 +55,11 @@ define <2 x double> @fixed_vec_fp_zero_constant() {
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
+;
+; NONEON-NOSVE-LABEL: fixed_vec_fp_zero_constant:
+; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI3_0
+; NONEON-NOSVE-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
+; NONEON-NOSVE-NEXT:    ret
   ret <2 x double> <double 0.0, double 0.0>
 }
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
index 61b67755a3544..7934f831a7e62 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sme2 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
 ; CHECK-LABEL: bfclamp:
@@ -11,3 +11,27 @@ define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bf
 }
 
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_bfclamp_single_x2_f16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d){
+; CHECK-LABEL: test_bfclamp_single_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    bfclamp { z0.h, z1.h }, z2.h, z3.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d)
+  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_bfclamp_single_x4_f16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d, <vscale x 8 x bfloat> %e, <vscale x 8 x bfloat> %f){
+; CHECK-LABEL: test_bfclamp_single_x4_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    bfclamp { z0.h - z3.h }, z4.h, z5.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d, <vscale x 8 x bfloat> %e, <vscale x 8 x bfloat> %f)
+  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index ab7cea8dfb778..c9fe89aec8ad9 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4725,94 +4725,102 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 def $x3
-; CHECK-GI-NEXT:    sxtw x8, w3
 ; CHECK-GI-NEXT:    sxtw x9, w1
+; CHECK-GI-NEXT:    sxtw x8, w3
 ; CHECK-GI-NEXT:    ldr d0, [x0]
 ; CHECK-GI-NEXT:    ldr d1, [x2]
 ; CHECK-GI-NEXT:    add x10, x0, x9
 ; CHECK-GI-NEXT:    add x11, x2, x8
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ldr d2, [x10]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    add x12, x11, x8
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ldr d3, [x11]
-; CHECK-GI-NEXT:    ldr d4, [x10]
-; CHECK-GI-NEXT:    ldr d5, [x12]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    add x11, x12, x8
-; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-GI-NEXT:    ushll v4.8h, v4.8b, #0
-; CHECK-GI-NEXT:    ushll v5.8h, v5.8b, #0
-; CHECK-GI-NEXT:    uabdl v6.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT:    uabdl2 v0.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    usubl v0.8h, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    ldr d1, [x10]
-; CHECK-GI-NEXT:    ldr d7, [x11]
+; CHECK-GI-NEXT:    ldr d2, [x11]
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    uabdl v16.4s, v2.4h, v3.4h
-; CHECK-GI-NEXT:    uabdl2 v2.4s, v2.8h, v3.8h
-; CHECK-GI-NEXT:    uabdl v3.4s, v4.4h, v5.4h
-; CHECK-GI-NEXT:    uabdl2 v4.4s, v4.8h, v5.8h
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll v7.8h, v7.8b, #0
-; CHECK-GI-NEXT:    ldr d5, [x10]
-; CHECK-GI-NEXT:    ldr d17, [x11]
+; CHECK-GI-NEXT:    usubl v1.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT:    ldr d3, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    ldr d2, [x10]
 ; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ldr d6, [x11]
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    add v0.4s, v6.4s, v0.4s
-; CHECK-GI-NEXT:    ushll v5.8h, v5.8b, #0
-; CHECK-GI-NEXT:    ushll v17.8h, v17.8b, #0
-; CHECK-GI-NEXT:    add v2.4s, v16.4s, v2.4s
-; CHECK-GI-NEXT:    add v3.4s, v3.4s, v4.4s
-; CHECK-GI-NEXT:    uabdl v4.4s, v1.4h, v7.4h
-; CHECK-GI-NEXT:    uabdl2 v1.4s, v1.8h, v7.8h
-; CHECK-GI-NEXT:    ldr d7, [x10]
+; CHECK-GI-NEXT:    usubl v3.8h, v3.8b, v4.8b
+; CHECK-GI-NEXT:    abs v5.4s, v5.4s
+; CHECK-GI-NEXT:    abs v0.4s, v0.4s
+; CHECK-GI-NEXT:    ldr d4, [x10]
 ; CHECK-GI-NEXT:    ldr d16, [x11]
+; CHECK-GI-NEXT:    abs v7.4s, v7.4s
+; CHECK-GI-NEXT:    abs v1.4s, v1.4s
 ; CHECK-GI-NEXT:    add x10, x10, x9
 ; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    ldr d18, [x10]
-; CHECK-GI-NEXT:    ldr d20, [x10, x9]
-; CHECK-GI-NEXT:    ldr d19, [x11]
-; CHECK-GI-NEXT:    ldr d21, [x11, x8]
-; CHECK-GI-NEXT:    uabdl v6.4s, v5.4h, v17.4h
-; CHECK-GI-NEXT:    ushll v7.8h, v7.8b, #0
-; CHECK-GI-NEXT:    ushll v16.8h, v16.8b, #0
-; CHECK-GI-NEXT:    uabdl2 v5.4s, v5.8h, v17.8h
-; CHECK-GI-NEXT:    ushll v17.8h, v18.8b, #0
-; CHECK-GI-NEXT:    ushll v18.8h, v19.8b, #0
-; CHECK-GI-NEXT:    add v1.4s, v4.4s, v1.4s
-; CHECK-GI-NEXT:    ushll v4.8h, v20.8b, #0
-; CHECK-GI-NEXT:    ushll v19.8h, v21.8b, #0
-; CHECK-GI-NEXT:    addv s2, v2.4s
+; CHECK-GI-NEXT:    usubl v2.8h, v2.8b, v6.8b
+; CHECK-GI-NEXT:    ldr d6, [x10]
+; CHECK-GI-NEXT:    ldr d17, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v16.8b
+; CHECK-GI-NEXT:    sshll v16.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
+; CHECK-GI-NEXT:    ldr d5, [x10]
+; CHECK-GI-NEXT:    ldr d7, [x11]
+; CHECK-GI-NEXT:    sshll v18.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    usubl v6.8h, v6.8b, v17.8b
+; CHECK-GI-NEXT:    ldr d17, [x11, x8]
+; CHECK-GI-NEXT:    sshll v19.4s, v4.4h, #0
+; CHECK-GI-NEXT:    usubl v5.8h, v5.8b, v7.8b
+; CHECK-GI-NEXT:    ldr d7, [x10, x9]
+; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-GI-NEXT:    abs v16.4s, v16.4s
+; CHECK-GI-NEXT:    abs v3.4s, v3.4s
+; CHECK-GI-NEXT:    abs v18.4s, v18.4s
+; CHECK-GI-NEXT:    abs v2.4s, v2.4s
+; CHECK-GI-NEXT:    usubl v7.8h, v7.8b, v17.8b
+; CHECK-GI-NEXT:    sshll v17.4s, v6.4h, #0
+; CHECK-GI-NEXT:    sshll2 v6.4s, v6.8h, #0
+; CHECK-GI-NEXT:    abs v19.4s, v19.4s
+; CHECK-GI-NEXT:    abs v4.4s, v4.4s
+; CHECK-GI-NEXT:    add v3.4s, v16.4s, v3.4s
+; CHECK-GI-NEXT:    sshll v16.4s, v5.4h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v5.8h, #0
+; CHECK-GI-NEXT:    add v2.4s, v18.4s, v2.4s
+; CHECK-GI-NEXT:    abs v17.4s, v17.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    abs v6.4s, v6.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    add v4.4s, v19.4s, v4.4s
 ; CHECK-GI-NEXT:    addv s3, v3.4s
-; CHECK-GI-NEXT:    uabdl v20.4s, v7.4h, v16.4h
-; CHECK-GI-NEXT:    uabdl2 v7.4s, v7.8h, v16.8h
-; CHECK-GI-NEXT:    add v5.4s, v6.4s, v5.4s
-; CHECK-GI-NEXT:    uabdl v6.4s, v17.4h, v18.4h
-; CHECK-GI-NEXT:    uabdl2 v16.4s, v17.8h, v18.8h
-; CHECK-GI-NEXT:    uabdl v17.4s, v4.4h, v19.4h
-; CHECK-GI-NEXT:    uabdl2 v4.4s, v4.8h, v19.8h
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    sshll v18.4s, v7.4h, #0
+; CHECK-GI-NEXT:    sshll2 v7.4s, v7.8h, #0
+; CHECK-GI-NEXT:    abs v16.4s, v16.4s
+; CHECK-GI-NEXT:    abs v5.4s, v5.4s
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    add v6.4s, v17.4s, v6.4s
+; CHECK-GI-NEXT:    addv s2, v2.4s
 ; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    addv s4, v4.4s
 ; CHECK-GI-NEXT:    fmov w10, s3
-; CHECK-GI-NEXT:    add v7.4s, v20.4s, v7.4s
-; CHECK-GI-NEXT:    add v0.4s, v17.4s, v4.4s
-; CHECK-GI-NEXT:    addv s4, v5.4s
-; CHECK-GI-NEXT:    add v2.4s, v6.4s, v16.4s
+; CHECK-GI-NEXT:    abs v18.4s, v18.4s
+; CHECK-GI-NEXT:    abs v7.4s, v7.4s
+; CHECK-GI-NEXT:    add v1.4s, v16.4s, v5.4s
 ; CHECK-GI-NEXT:    add w8, w8, w9
-; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    addv s3, v6.4s
+; CHECK-GI-NEXT:    fmov w9, s2
 ; CHECK-GI-NEXT:    add w8, w10, w8
-; CHECK-GI-NEXT:    addv s3, v7.4s
-; CHECK-GI-NEXT:    addv s1, v2.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    add w8, w9, w8
-; CHECK-GI-NEXT:    fmov w9, s4
+; CHECK-GI-NEXT:    fmov w10, s4
+; CHECK-GI-NEXT:    add v0.4s, v18.4s, v7.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
 ; CHECK-GI-NEXT:    add w8, w9, w8
 ; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    add w8, w10, w8
+; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    add w8, w9, w8
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    add w8, w9, w8
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 50423c59eabe9..526d5c946ec7f 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -108,7 +108,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
 }
 
 ; no-op
-; HSA-LABEl: {{^}}use_constant_to_flat_addrspacecast:
+; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
 ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
@@ -119,7 +119,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
   ret void
 }
 
-; HSA-LABEl: {{^}}use_constant_to_global_addrspacecast:
+; HSA-LABEL: {{^}}use_constant_to_global_addrspacecast:
 ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
 ; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
 ; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 9d4f9434aa314..1a0fda3d54d3f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,7 +142,8 @@ attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; GCN-NEXT:      '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; SDAG-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GISEL-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01cb{{$}}
 ; GCN-NEXT:      '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
@@ -156,10 +157,10 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:        .backend_stack_size: 0x10{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; SDAG-NEXT:        .sgpr_count:     0x25{{$}}
-; GISEL-NEXT:        .sgpr_count:     0x26{{$}}
+; GISEL-NEXT:        .sgpr_count:     0x27{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; SDAG-NEXT:        .vgpr_count:     0x3{{$}}
-; GISEL-NEXT:        .vgpr_count:     0x4{{$}}
+; GISEL-NEXT:        .vgpr_count:     0x5{{$}}
 ; GCN-NEXT:      multiple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x24{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 29621a0477418..1151bde02ef62 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -4,7 +4,7 @@
 
 ---
 
-# GCN-label: name: vop3
+# GCN-LABEL: name: vop3
 # GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
 # GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
 # GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
@@ -37,7 +37,7 @@ body:             |
 ...
 ---
 
-# GCN-label: name: vop3_sgpr_src1
+# GCN-LABEL: name: vop3_sgpr_src1
 # GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
 # GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
 # GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
@@ -81,7 +81,7 @@ body:             |
 ---
 
 # Regression test for src_modifiers on base u16 opcode
-# GCN-label: name: vop3_u16
+# GCN-LABEL: name: vop3_u16
 # GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
 # GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec
 # GCN: %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec
@@ -205,7 +205,7 @@ body:             |
 ...
 
 # do not combine, dpp arg used twice
-# GCN-label: name: dpp_arg_twice
+# GCN-LABEL: name: dpp_arg_twice
 # GCN: %4:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %3, 2, %3, 1, 2, implicit $mode, implicit $exec
 # GCN: %6:vgpr_32 = V_FMA_F32_e64 2, %5, 2, %1, 2, %5, 1, 2, implicit $mode, implicit $exec
 # GCN: %8:vgpr_32 = V_FMA_F32_e64 2, %7, 2, %7, 2, %1, 1, 2, implicit $mode, implicit $exec
@@ -231,7 +231,7 @@ body:             |
 ...
 
 # when the dpp source isn't a src0 operand the operation should be commuted if possible
-# GCN-label: name: dpp_commute_e64
+# GCN-LABEL: name: dpp_commute_e64
 # GCN: %4:vgpr_32  = V_MUL_U32_U24_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec
 # GCN: %7:vgpr_32 = V_FMA_F32_e64_dpp %5, 2, %0, 1, %1, 2, %1, 1, 2, 1, 15, 15, 1, implicit $mode, implicit $exec
 # GCN: %10:vgpr_32 = V_SUBREV_U32_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 9690e126dfcfc..3ec36f03a48aa 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -3249,3 +3249,209 @@ define double @v_fmaximum3_f64_const1_const2(double %a) {
   %max1 = call double @llvm.maximum.f64(double %max0, double 16.0)
   ret double %max1
 }
+
+define <2 x float> @v_no_fmaximum3_f32__multi_use(float %a, float %b, float %c) {
+; GFX12-LABEL: v_no_fmaximum3_f32__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f32 v1, v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_no_fmaximum3_f32__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  %insert.0 = insertelement <2 x float> poison, float %max0, i32 0
+  %insert.1 = insertelement <2 x float> %insert.0, float %max1, i32 1
+  ret <2 x float> %insert.1
+}
+
+define amdgpu_ps <2 x i32> @s_no_fmaximum3_f32__multi_use(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_no_fmaximum3_f32__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_maximum_f32 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT:    s_maximum_f32 s1, s0, s2
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_no_fmaximum3_f32__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, s2, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  %cast0 = bitcast float %max0 to i32
+  %cast1 = bitcast float %max1 to i32
+  %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0)
+  %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast1)
+  %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0
+  %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1
+  ret <2 x i32> %insert.1
+}
+
+define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) {
+; GFX12-LABEL: v_no_fmaximum3_f16__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f16 v1, v0, v2
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_no_fmaximum3_f16__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.maximum.f16(half %a, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+  %insert.0 = insertelement <2 x half> poison, half %max0, i32 0
+  %insert.1 = insertelement <2 x half> %insert.0, half %max1, i32 1
+  ret <2 x half> %insert.1
+}
+
+define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half inreg %b, half inreg %c) {
+; GFX12-LABEL: s_no_fmaximum3_f16__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_maximum_f16 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-NEXT:    s_maximum_f16 s1, s0, s2
+; GFX12-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX12-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_no_fmaximum3_f16__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_max_f16_e32 v1, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, s2, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %max0 = call half @llvm.maximum.f16(half %a, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+  %cast0 = bitcast half %max0 to i16
+  %cast1 = bitcast half %max1 to i16
+  %ext0 = zext i16 %cast0 to i32
+  %ext1 = zext i16 %cast1 to i32
+  %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext0)
+  %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext1)
+  %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0
+  %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1
+  ret <2 x i32> %insert.1
+}
+
+define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_no_fmaximum3_v2f16__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_no_fmaximum3_v2f16__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v5, s4
+; GFX9-NEXT:    v_pk_max_f16 v3, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v1, v5, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
+  %concat = shufflevector <2 x half> %max0, <2 x half> %max1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x half> %concat ; returning both results gives %max0 a second use besides %max1
+}
+
+define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double %c) {
+; GFX12-LABEL: v_no_fmaximum3_f64__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[2:3], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_no_fmaximum3_f64__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.maximum.f64(double %a, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+  %insert.0 = insertelement <2 x double> poison, double %max0, i32 0
+  %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1
+  ret <2 x double> %insert.1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 7481fff251d89..0e0b73b88d2dc 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -3249,3 +3249,209 @@ define double @v_fminimum3_f64_const1_const2(double %a) {
   %max1 = call double @llvm.minimum.f64(double %max0, double 16.0)
   ret double %max1
 }
+
+define <2 x float> @v_no_fminimum3_f32__multi_use(float %a, float %b, float %c) {
+; GFX12-LABEL: v_no_fminimum3_f32__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f32 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f32 v1, v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_no_fminimum3_f32__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.minimum.f32(float %a, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+  %insert.0 = insertelement <2 x float> poison, float %max0, i32 0
+  %insert.1 = insertelement <2 x float> %insert.0, float %max1, i32 1
+  ret <2 x float> %insert.1
+}
+
+define amdgpu_ps <2 x i32> @s_no_fminimum3_f32__multi_use(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_no_fminimum3_f32__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_minimum_f32 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT:    s_minimum_f32 s1, s0, s2
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_no_fminimum3_f32__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, s2, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %max0 = call float @llvm.minimum.f32(float %a, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+  %cast0 = bitcast float %max0 to i32
+  %cast1 = bitcast float %max1 to i32
+  %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0)
+  %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast1)
+  %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0
+  %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1
+  ret <2 x i32> %insert.1
+}
+
+define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) {
+; GFX12-LABEL: v_no_fminimum3_f16__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f16 v1, v0, v2
+; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_no_fminimum3_f16__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.minimum.f16(half %a, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+  %insert.0 = insertelement <2 x half> poison, half %max0, i32 0
+  %insert.1 = insertelement <2 x half> %insert.0, half %max1, i32 1
+  ret <2 x half> %insert.1
+}
+
+define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half inreg %b, half inreg %c) {
+; GFX12-LABEL: s_no_fminimum3_f16__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_minimum_f16 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-NEXT:    s_minimum_f16 s1, s0, s2
+; GFX12-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX12-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_no_fminimum3_f16__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_min_f16_e32 v1, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, s2, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %max0 = call half @llvm.minimum.f16(half %a, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+  %cast0 = bitcast half %max0 to i16
+  %cast1 = bitcast half %max1 to i16
+  %ext0 = zext i16 %cast0 to i32
+  %ext1 = zext i16 %cast1 to i32
+  %readfirstlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext0)
+  %readfirstlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %ext1)
+  %insert.0 = insertelement <2 x i32> poison, i32 %readfirstlane0, i32 0
+  %insert.1 = insertelement <2 x i32> %insert.0, i32 %readfirstlane1, i32 1
+  ret <2 x i32> %insert.1
+}
+
+define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_no_fminimum3_v2f16__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_no_fminimum3_v2f16__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v5, s4
+; GFX9-NEXT:    v_pk_min_f16 v3, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v1, v5, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
+  %concat = shufflevector <2 x half> %max0, <2 x half> %max1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x half> %concat ; returning both results gives %max0 a second use besides %max1
+}
+
+define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double %c) {
+; GFX12-LABEL: v_no_fminimum3_f64__multi_use:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[2:3], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_no_fminimum3_f64__multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.minimum.f64(double %a, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+  %insert.0 = insertelement <2 x double> poison, double %max0, i32 0
+  %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1
+  ret <2 x double> %insert.1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index 9547f08d3eba6..1429251fc6421 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -1,17 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 define amdgpu_kernel void @workgroup_ids_kernel() {
 ; GFX9-LABEL: workgroup_ids_kernel:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -72,27 +72,20 @@ define amdgpu_kernel void @workgroup_ids_kernel() {
 define amdgpu_kernel void @caller() {
 ; GFX9-SDAG-LABEL: caller:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-SDAG-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-SDAG-NEXT:    s_mov_b32 s38, -1
-; GFX9-SDAG-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-SDAG-NEXT:    s_add_u32 s36, s36, s7
-; GFX9-SDAG-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-SDAG-NEXT:    s_add_u32 s8, s2, 36
-; GFX9-SDAG-NEXT:    s_addc_u32 s9, s3, 0
-; GFX9-SDAG-NEXT:    s_getpc_b64 s[2:3]
-; GFX9-SDAG-NEXT:    s_add_u32 s2, s2, callee@gotpcrel32@lo+4
-; GFX9-SDAG-NEXT:    s_addc_u32 s3, s3, callee@gotpcrel32@hi+12
-; GFX9-SDAG-NEXT:    s_load_dwordx2 s[14:15], s[2:3], 0x0
-; GFX9-SDAG-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX9-SDAG-NEXT:    s_add_u32 flat_scratch_lo, s10, s13
+; GFX9-SDAG-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s13
+; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX9-SDAG-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-SDAG-NEXT:    s_add_u32 s8, s8, callee@gotpcrel32@lo+4
+; GFX9-SDAG-NEXT:    s_addc_u32 s9, s9, callee@gotpcrel32@hi+12
+; GFX9-SDAG-NEXT:    s_load_dwordx2 s[14:15], s[8:9], 0x0
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-SDAG-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX9-SDAG-NEXT:    s_mov_b32 s12, s6
-; GFX9-SDAG-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-SDAG-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX9-SDAG-NEXT:    s_mov_b32 s32, 0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_swappc_b64 s[30:31], s[14:15]
@@ -100,27 +93,20 @@ define amdgpu_kernel void @caller() {
 ;
 ; GFX9-GISEL-LABEL: caller:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-GISEL-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-GISEL-NEXT:    s_mov_b32 s38, -1
-; GFX9-GISEL-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-GISEL-NEXT:    s_add_u32 s36, s36, s7
-; GFX9-GISEL-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-GISEL-NEXT:    s_add_u32 s8, s2, 36
-; GFX9-GISEL-NEXT:    s_addc_u32 s9, s3, 0
-; GFX9-GISEL-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX9-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-GISEL-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
-; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9-GISEL-NEXT:    s_load_dwordx2 s[14:15], s[0:1], 0x0
+; GFX9-GISEL-NEXT:    s_add_u32 flat_scratch_lo, s10, s13
+; GFX9-GISEL-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-GISEL-NEXT:    s_add_u32 s0, s0, s13
+; GFX9-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX9-GISEL-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-GISEL-NEXT:    s_add_u32 s8, s8, callee@gotpcrel32@lo+4
+; GFX9-GISEL-NEXT:    s_addc_u32 s9, s9, callee@gotpcrel32@hi+12
+; GFX9-GISEL-NEXT:    s_load_dwordx2 s[14:15], s[8:9], 0x0
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX9-GISEL-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-GISEL-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-GISEL-NEXT:    s_mov_b32 s12, s6
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX9-GISEL-NEXT:    s_mov_b32 s32, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_swappc_b64 s[30:31], s[14:15]
@@ -128,81 +114,61 @@ define amdgpu_kernel void @caller() {
 ;
 ; GFX9ARCH-SDAG-LABEL: caller:
 ; GFX9ARCH-SDAG:       ; %bb.0:
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s38, -1
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9ARCH-SDAG-NEXT:    s_add_u32 s36, s36, s6
-; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9ARCH-SDAG-NEXT:    s_add_u32 s8, s2, 36
-; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s9, s3, 0
-; GFX9ARCH-SDAG-NEXT:    s_getpc_b64 s[2:3]
-; GFX9ARCH-SDAG-NEXT:    s_add_u32 s2, s2, callee@gotpcrel32@lo+4
-; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s3, s3, callee@gotpcrel32@hi+12
-; GFX9ARCH-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
-; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX9ARCH-SDAG-NEXT:    s_add_u32 flat_scratch_lo, s10, s12
+; GFX9ARCH-SDAG-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9ARCH-SDAG-NEXT:    s_add_u32 s0, s0, s12
+; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX9ARCH-SDAG-NEXT:    s_getpc_b64 s[8:9]
+; GFX9ARCH-SDAG-NEXT:    s_add_u32 s8, s8, callee@gotpcrel32@lo+4
+; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s9, s9, callee@gotpcrel32@hi+12
+; GFX9ARCH-SDAG-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
 ; GFX9ARCH-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9ARCH-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9ARCH-SDAG-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX9ARCH-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
 ; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s32, 0
 ; GFX9ARCH-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9ARCH-SDAG-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9ARCH-SDAG-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GFX9ARCH-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9ARCH-GISEL-LABEL: caller:
 ; GFX9ARCH-GISEL:       ; %bb.0:
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s38, -1
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9ARCH-GISEL-NEXT:    s_add_u32 s36, s36, s6
-; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9ARCH-GISEL-NEXT:    s_add_u32 s8, s2, 36
-; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s9, s3, 0
-; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9ARCH-GISEL-NEXT:    s_getpc_b64 s[0:1]
-; GFX9ARCH-GISEL-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
-; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9ARCH-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9ARCH-GISEL-NEXT:    s_add_u32 flat_scratch_lo, s10, s12
+; GFX9ARCH-GISEL-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9ARCH-GISEL-NEXT:    s_add_u32 s0, s0, s12
+; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX9ARCH-GISEL-NEXT:    s_getpc_b64 s[8:9]
+; GFX9ARCH-GISEL-NEXT:    s_add_u32 s8, s8, callee@gotpcrel32@lo+4
+; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s9, s9, callee@gotpcrel32@hi+12
+; GFX9ARCH-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x0
 ; GFX9ARCH-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9ARCH-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9ARCH-GISEL-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX9ARCH-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
-; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s32, 0
 ; GFX9ARCH-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9ARCH-GISEL-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9ARCH-GISEL-NEXT:    s_swappc_b64 s[30:31], s[12:13]
 ; GFX9ARCH-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-SDAG-LABEL: caller:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9
-; GFX12-SDAG-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX12-SDAG-NEXT:    s_mov_b32 s7, callee@abs32@hi
-; GFX12-SDAG-NEXT:    s_mov_b32 s6, callee@abs32@lo
-; GFX12-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX12-SDAG-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX12-SDAG-NEXT:    s_mov_b32 s32, 0
-; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX12-SDAG-NEXT:    s_endpgm
-;
-; GFX12-GISEL-LABEL: caller:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9
-; GFX12-GISEL-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX12-GISEL-NEXT:    s_mov_b32 s6, callee@abs32@lo
-; GFX12-GISEL-NEXT:    s_mov_b32 s7, callee@abs32@hi
-; GFX12-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX12-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GFX12-GISEL-NEXT:    s_mov_b32 s32, 0
-; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX12-GISEL-NEXT:    s_endpgm
+; GFX12-LABEL: caller:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX12-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_add_co_u32 s4, s4, callee@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, callee@gotpcrel32@hi+16
+; GFX12-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9
+; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x0
+; GFX12-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX12-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX12-NEXT:    s_mov_b32 s32, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX12-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workgroup.id.x()
   call void @callee(i32 %idx) #0
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index 14fe4e5f48c67..8009f917aef5a 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s
-; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
@@ -67,62 +67,37 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 }
 
 define amdgpu_cs void @caller() {
-; GFX9-LABEL: caller:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
-; GFX9-NEXT:    s_add_u32 s8, s8, s0
-; GFX9-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[8:9]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; GFX9-NEXT:    s_mov_b32 s32, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    s_endpgm
-;
 ; GFX9ARCH-SDAG-LABEL: caller:
 ; GFX9ARCH-SDAG:       ; %bb.0:
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s10, -1
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9ARCH-SDAG-NEXT:    s_getpc_b64 s[8:9]
+; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s8, s0
+; GFX9ARCH-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
+; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s5, callee@abs32@hi
+; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s4, callee@abs32@lo
+; GFX9ARCH-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX9ARCH-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9ARCH-SDAG-NEXT:    s_add_u32 s8, s8, s0
 ; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9ARCH-SDAG-NEXT:    s_getpc_b64 s[0:1]
-; GFX9ARCH-SDAG-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
-; GFX9ARCH-SDAG-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9ARCH-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[0:1], s[8:9]
 ; GFX9ARCH-SDAG-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; GFX9ARCH-SDAG-NEXT:    v_mov_b32_e32 v0, ttmp9
-; GFX9ARCH-SDAG-NEXT:    s_mov_b32 s32, 0
-; GFX9ARCH-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9ARCH-SDAG-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9ARCH-SDAG-NEXT:    s_endpgm
 ;
 ; GFX9ARCH-GISEL-LABEL: caller:
 ; GFX9ARCH-GISEL:       ; %bb.0:
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s10, -1
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s11, 0xe00000
+; GFX9ARCH-GISEL-NEXT:    s_getpc_b64 s[8:9]
+; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s8, s0
+; GFX9ARCH-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
+; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s4, callee@abs32@lo
+; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s5, callee@abs32@hi
+; GFX9ARCH-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
+; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX9ARCH-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9ARCH-GISEL-NEXT:    s_add_u32 s8, s8, s0
 ; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s9, s9, 0
-; GFX9ARCH-GISEL-NEXT:    s_getpc_b64 s[0:1]
-; GFX9ARCH-GISEL-NEXT:    s_add_u32 s0, s0, callee@gotpcrel32@lo+4
-; GFX9ARCH-GISEL-NEXT:    s_addc_u32 s1, s1, callee@gotpcrel32@hi+12
-; GFX9ARCH-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[0:1], s[8:9]
-; GFX9ARCH-GISEL-NEXT:    v_mov_b32_e32 v0, ttmp9
 ; GFX9ARCH-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; GFX9ARCH-GISEL-NEXT:    s_mov_b32 s32, 0
-; GFX9ARCH-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9ARCH-GISEL-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9ARCH-GISEL-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
new file mode 100644
index 0000000000000..50a3336a7483c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
@@ -0,0 +1,1757 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+; Test if fcmp+select patterns form min/max instructions when allowed
+; by flags.
+
+; TODO: Merge with fmin_legacy.ll/fmax_legacy.ll
+
+define float @v_test_fmin_legacy_ule_f32_safe(float %a, float %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_f32_safe:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_f32_safe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_f32_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  ret float %val
+}
+
+define float @v_test_fmin_legacy_ule_f32_nnan_flag(float %a, float %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_f32_nnan_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_f32_nnan_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_f32_nnan_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule float %a, %b
+  %val = select nnan i1 %cmp, float %a, float %b
+  ret float %val
+}
+
+define float @v_test_fmin_legacy_ule_f32_nsz_flag(float %a, float %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_f32_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_f32_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_f32_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule float %a, %b
+  %val = select nsz i1 %cmp, float %a, float %b
+  ret float %val
+}
+
+define float @v_test_fmin_legacy_ule_f32_nnan_nsz_flag(float %a, float %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_f32_nnan_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_f32_nnan_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_f32_nnan_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_min_num_f32_e32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule float %a, %b
+  %val = select nnan nsz i1 %cmp, float %a, float %b
+  ret float %val
+}
+
+define float @v_test_fmax_legacy_uge_f32_safe(float %a, float %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_f32_safe:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_f32_safe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_f32_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  ret float %val
+}
+
+define float @v_test_fmax_legacy_uge_f32_nnan_flag(float %a, float %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nnan_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nnan_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nnan_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge float %a, %b
+  %val = select nnan i1 %cmp, float %a, float %b
+  ret float %val
+}
+
+define float @v_test_fmax_legacy_uge_f32_nsz_flag(float %a, float %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge float %a, %b
+  %val = select nsz i1 %cmp, float %a, float %b
+  ret float %val
+}
+
+define float @v_test_fmax_legacy_uge_f32_nnan_nsz_flag(float %a, float %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nnan_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nnan_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nnan_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_max_num_f32_e32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge float %a, %b
+  %val = select nnan nsz i1 %cmp, float %a, float %b
+  ret float %val
+}
+
+define <2 x float> @v_test_fmin_legacy_ule_v2f32_safe(<2 x float> %a, <2 x float> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_safe:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v2f32_safe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <2 x float> %a, %b
+  %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %val
+}
+
+define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_flag(<2 x float> %a, <2 x float> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <2 x float> %a, %b
+  %val = select nnan <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %val
+}
+
+define <2 x float> @v_test_fmin_legacy_ule_v2f32_nsz_flag(<2 x float> %a, <2 x float> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v2f32_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <2 x float> %a, %b
+  %val = select nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %val
+}
+
+define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag(<2 x float> %a, <2 x float> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <2 x float> %a, %b
+  %val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %val
+}
+
+define <2 x float> @v_test_fmax_legacy_uge_v2f32_safe(<2 x float> %a, <2 x float> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_safe:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_safe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <2 x float> %a, %b
+  %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %val
+}
+
+define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_flag(<2 x float> %a, <2 x float> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <2 x float> %a, %b
+  %val = select nnan <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %val
+}
+
+define <2 x float> @v_test_fmax_legacy_uge_v2f32_nsz_flag(<2 x float> %a, <2 x float> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <2 x float> %a, %b
+  %val = select nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %val
+}
+
+define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag(<2 x float> %a, <2 x float> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_nlt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <2 x float> %a, %b
+  %val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %val
+}
+
+define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_f16_safe:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_f16_safe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_f16_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule half %a, %b
+  %val = select i1 %cmp, half %a, half %b
+  ret half %val
+}
+
+define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule half %a, %b
+  %val = select nnan i1 %cmp, half %a, half %b
+  ret half %val
+}
+
+define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule half %a, %b
+  %val = select nsz i1 %cmp, half %a, half %b
+  ret half %val
+}
+
+define half @v_test_fmin_legacy_ule_f16_nnan_nsz_flag(half %a, half %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule half %a, %b
+  %val = select nnan nsz i1 %cmp, half %a, half %b
+  ret half %val
+}
+
+define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_f16_safe:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_f16_safe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_f16_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge half %a, %b
+  %val = select i1 %cmp, half %a, half %b
+  ret half %val
+}
+
+define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge half %a, %b
+  %val = select nnan i1 %cmp, half %a, half %b
+  ret half %val
+}
+
+define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge half %a, %b
+  %val = select nsz i1 %cmp, half %a, half %b
+  ret half %val
+}
+
+define half @v_test_fmax_legacy_uge_f16_nnan_nsz_flag(half %a, half %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge half %a, %b
+  %val = select nnan nsz i1 %cmp, half %a, half %b
+  ret half %val
+}
+
+define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_safe:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_safe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <2 x half> %a, %b
+  %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %val
+}
+
+define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x half> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <2 x half> %a, %b
+  %val = select nnan <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %val
+}
+
+define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x half> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <2 x half> %a, %b
+  %val = select nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %val
+}
+
+define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag(<2 x half> %a, <2 x half> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <2 x half> %a, %b
+  %val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %val
+}
+
+define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_safe:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_safe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <2 x half> %a, %b
+  %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %val
+}
+
+define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x half> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <2 x half> %a, %b
+  %val = select nnan <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %val
+}
+
+define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x half> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <2 x half> %a, %b
+  %val = select nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %val
+}
+
+define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag(<2 x half> %a, <2 x half> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v2, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v3, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <2 x half> %a, %b
+  %val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %val
+}
+
+define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_safe:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v4, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v5, v1
+; GFX7-NEXT:    v_min_legacy_f32_e32 v2, v6, v2
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v7, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_safe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v7, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
+; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <4 x half> %a, %b
+  %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
+  ret <4 x half> %val
+}
+
+define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x half> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v4, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v5, v1
+; GFX7-NEXT:    v_min_legacy_f32_e32 v2, v6, v2
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v7, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v7, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
+; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <4 x half> %a, %b
+  %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
+  ret <4 x half> %val
+}
+
+define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x half> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v4, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v5, v1
+; GFX7-NEXT:    v_min_legacy_f32_e32 v2, v6, v2
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v7, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v7, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
+; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <4 x half> %a, %b
+  %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
+  ret <4 x half> %val
+}
+
+define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag(<4 x half> %a, <4 x half> %b) {
+; GFX7-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_min_legacy_f32_e32 v0, v4, v0
+; GFX7-NEXT:    v_min_legacy_f32_e32 v1, v5, v1
+; GFX7-NEXT:    v_min_legacy_f32_e32 v2, v6, v2
+; GFX7-NEXT:    v_min_legacy_f32_e32 v3, v7, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_min_num_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_min_num_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp ule <4 x half> %a, %b
+  %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
+  ret <4 x half> %val
+}
+
+define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_safe:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v4, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v5, v1
+; GFX7-NEXT:    v_max_legacy_f32_e32 v2, v6, v2
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v7, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_safe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v7, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
+; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <4 x half> %a, %b
+  %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
+  ret <4 x half> %val
+}
+
+define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x half> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v4, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v5, v1
+; GFX7-NEXT:    v_max_legacy_f32_e32 v2, v6, v2
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v7, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v7, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
+; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <4 x half> %a, %b
+  %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
+  ret <4 x half> %val
+}
+
+define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x half> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v4, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v5, v1
+; GFX7-NEXT:    v_max_legacy_f32_e32 v2, v6, v2
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v7, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v7, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
+; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <4 x half> %a, %b
+  %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
+  ret <4 x half> %val
+}
+
+define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag(<4 x half> %a, <4 x half> %b) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX7-NEXT:    v_max_legacy_f32_e32 v0, v4, v0
+; GFX7-NEXT:    v_max_legacy_f32_e32 v1, v5, v1
+; GFX7-NEXT:    v_max_legacy_f32_e32 v2, v6, v2
+; GFX7-NEXT:    v_max_legacy_f32_e32 v3, v7, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_max_num_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_max_num_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %cmp = fcmp uge <4 x half> %a, %b
+  %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
+  ret <4 x half> %val
+}
+
+define float @v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs(float %arg0, float %arg1) {
+; GFX7-LABEL: v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v1
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_add_f32 v0, v0, v0 :: v_dual_add_f32 v1, v1, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f32_e32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a = fadd nnan float %arg0, %arg0
+  %b = fadd nnan float %arg1, %arg1
+  %cmp = fcmp ule float %a, %b
+  %val = select nsz i1 %cmp, float %a, float %b
+  ret float %val
+}
+
+define float @v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs(float %arg0, float %arg1) {
+; GFX7-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX7-NEXT:    v_add_f32_e32 v1, v1, v1
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_add_f32 v0, v0, v0 :: v_dual_add_f32 v1, v1, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f32_e32 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a = fadd nnan float %arg0, %arg0
+  %b = fadd nnan float %arg1, %arg1
+  %cmp = fcmp uge float %a, %b
+  %val = select nsz i1 %cmp, float %a, float %b
+  ret float %val
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index bfc249e9081d2..340f0cdd5d5d0 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -245,6 +245,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI2]], %subreg.sub0, killed [[PHI3]], %subreg.sub1
   ; SI-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; SI-NEXT:   [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.3:
   ; SI-NEXT:   successors: %bb.4(0x80000000)
@@ -261,8 +262,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
   ; SI-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; SI-NEXT:   [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
-  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]]
+  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY6]]
   ; SI-NEXT:   $vgpr0 = COPY killed [[PHI5]]
   ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
@@ -282,6 +282,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1
   ; SI-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; SI-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.7:
   ; SI-NEXT:   successors: %bb.8(0x80000000)
@@ -298,8 +299,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
   ; SI-NEXT:   successors: %bb.7(0x40000000), %bb.9(0x40000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; SI-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
-  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]]
+  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY9]]
   ; SI-NEXT:   $vgpr0 = COPY killed [[PHI7]]
   ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
@@ -367,6 +367,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI1]], %subreg.sub0, killed [[PHI2]], %subreg.sub1
   ; SI-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; SI-NEXT:   [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.3:
   ; SI-NEXT:   successors: %bb.4(0x80000000)
@@ -382,8 +383,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
   ; SI-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; SI-NEXT:   [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
-  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]]
+  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY6]]
   ; SI-NEXT:   $vgpr0 = COPY [[COPY4]]
   ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
@@ -403,6 +403,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1
   ; SI-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+  ; SI-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.7:
   ; SI-NEXT:   successors: %bb.8(0x80000000)
@@ -418,8 +419,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
   ; SI-NEXT:   successors: %bb.7(0x40000000), %bb.9(0x40000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; SI-NEXT:   [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
-  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]]
+  ; SI-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY9]]
   ; SI-NEXT:   $vgpr0 = COPY [[COPY4]]
   ; SI-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0
   ; SI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll
index 4f2e63b5f2467..c53fb2f330a79 100644
--- a/llvm/test/CodeGen/ARM/atomic-load-store.ll
+++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -439,3 +439,539 @@ define void @test_old_store_64bit(ptr %p, i64 %v) {
   store atomic i64 %v, ptr %p seq_cst, align 8
   ret void
 }
+
+define half @load_atomic_f16__seq_cst(ptr %ptr) {
+; ARM-LABEL: load_atomic_f16__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    ldrh r0, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_f16__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    ldrh r0, [r0]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: load_atomic_f16__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    ldrh r0, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: load_atomic_f16__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    movs r1, #0
+; THUMBONE-NEXT:    mov r2, r1
+; THUMBONE-NEXT:    bl __sync_val_compare_and_swap_2
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_f16__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r1, #5
+; ARMV4-NEXT:    bl __atomic_load_2
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_f16__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    ldrh r0, [r0]
+; ARMV6-NEXT:    mov r1, #0
+; ARMV6-NEXT:    mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: load_atomic_f16__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    ldrh r0, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  %val = load atomic half, ptr %ptr seq_cst, align 2
+  ret half %val
+}
+
+define bfloat @load_atomic_bf16__seq_cst(ptr %ptr) {
+; ARM-LABEL: load_atomic_bf16__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    ldrh r0, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_bf16__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    ldrh r0, [r0]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: load_atomic_bf16__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    ldrh r0, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: load_atomic_bf16__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    movs r1, #0
+; THUMBONE-NEXT:    mov r2, r1
+; THUMBONE-NEXT:    bl __sync_val_compare_and_swap_2
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_bf16__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r1, #5
+; ARMV4-NEXT:    bl __atomic_load_2
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_bf16__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    ldrh r0, [r0]
+; ARMV6-NEXT:    mov r1, #0
+; ARMV6-NEXT:    mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: load_atomic_bf16__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    ldrh r0, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  %val = load atomic bfloat, ptr %ptr seq_cst, align 2
+  ret bfloat %val
+}
+
+define float @load_atomic_f32__seq_cst(ptr %ptr) {
+; ARM-LABEL: load_atomic_f32__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    ldr r0, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_f32__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    ldr r0, [r0]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    vmov s0, r0
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: load_atomic_f32__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    ldr r0, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: load_atomic_f32__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    movs r1, #0
+; THUMBONE-NEXT:    mov r2, r1
+; THUMBONE-NEXT:    bl __sync_val_compare_and_swap_4
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_f32__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r1, #5
+; ARMV4-NEXT:    bl __atomic_load_4
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_f32__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    ldr r0, [r0]
+; ARMV6-NEXT:    mov r1, #0
+; ARMV6-NEXT:    mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: load_atomic_f32__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    ldr r0, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  %val = load atomic float, ptr %ptr seq_cst, align 4
+  ret float %val
+}
+
+define double @load_atomic_f64__seq_cst(ptr %ptr) {
+; ARM-LABEL: load_atomic_f64__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    ldrexd r0, r1, [r0]
+; ARM-NEXT:    clrex
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: load_atomic_f64__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    ldrexd r2, r3, [r0]
+; ARMOPTNONE-NEXT:    mov r1, r3
+; ARMOPTNONE-NEXT:    mov r0, r2
+; ARMOPTNONE-NEXT:    clrex
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    vmov d16, r0, r1
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: load_atomic_f64__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    ldrexd r0, r1, [r0]
+; THUMBTWO-NEXT:    clrex
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: load_atomic_f64__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    sub sp, #8
+; THUMBONE-NEXT:    movs r2, #0
+; THUMBONE-NEXT:    str r2, [sp]
+; THUMBONE-NEXT:    str r2, [sp, #4]
+; THUMBONE-NEXT:    mov r3, r2
+; THUMBONE-NEXT:    bl __sync_val_compare_and_swap_8
+; THUMBONE-NEXT:    add sp, #8
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: load_atomic_f64__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r1, #5
+; ARMV4-NEXT:    bl __atomic_load_8
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: load_atomic_f64__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    ldrexd r0, r1, [r0]
+; ARMV6-NEXT:    mov r2, #0
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: load_atomic_f64__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    push {r7, lr}
+; THUMBM-NEXT:    movs r1, #5
+; THUMBM-NEXT:    bl __atomic_load_8
+; THUMBM-NEXT:    pop {r7, pc}
+  %val = load atomic double, ptr %ptr seq_cst, align 8
+  ret double %val
+}
+
+define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) {
+; ARM-LABEL: store_atomic_f16__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    strh r1, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: store_atomic_f16__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    sub sp, sp, #4
+; ARMOPTNONE-NEXT:    str r1, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    mov r1, r0
+; ARMOPTNONE-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    vmov s0, r0
+; ARMOPTNONE-NEXT:    vmov r0, s0
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    strh r0, [r1]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    add sp, sp, #4
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: store_atomic_f16__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    strh r1, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: store_atomic_f16__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    bl __sync_lock_test_and_set_2
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_f16__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r2, #5
+; ARMV4-NEXT:    bl __atomic_store_2
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_f16__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    mov r2, #0
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    strh r1, [r0]
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: store_atomic_f16__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    strh r1, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  store atomic half %val1, ptr %ptr seq_cst, align 2
+  ret void
+}
+
+define void @store_atomic_bf16__seq_cst(ptr %ptr, bfloat %val1) {
+; ARM-LABEL: store_atomic_bf16__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    strh r1, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: store_atomic_bf16__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    sub sp, sp, #4
+; ARMOPTNONE-NEXT:    str r1, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    mov r1, r0
+; ARMOPTNONE-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    vmov s0, r0
+; ARMOPTNONE-NEXT:    vmov r0, s0
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    strh r0, [r1]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    add sp, sp, #4
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: store_atomic_bf16__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    strh r1, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: store_atomic_bf16__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    bl __sync_lock_test_and_set_2
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_bf16__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r2, #5
+; ARMV4-NEXT:    bl __atomic_store_2
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_bf16__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    mov r2, #0
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    strh r1, [r0]
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: store_atomic_bf16__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    strh r1, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  store atomic bfloat %val1, ptr %ptr seq_cst, align 2
+  ret void
+}
+
+define void @store_atomic_f32__seq_cst(ptr %ptr, float %val1) {
+; ARM-LABEL: store_atomic_f32__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    str r1, [r0]
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    bx lr
+;
+; ARMOPTNONE-LABEL: store_atomic_f32__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    sub sp, sp, #4
+; ARMOPTNONE-NEXT:    str r1, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    mov r1, r0
+; ARMOPTNONE-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    vmov s0, r0
+; ARMOPTNONE-NEXT:    vmov r0, s0
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    str r0, [r1]
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    add sp, sp, #4
+; ARMOPTNONE-NEXT:    bx lr
+;
+; THUMBTWO-LABEL: store_atomic_f32__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    str r1, [r0]
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: store_atomic_f32__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    bl __sync_lock_test_and_set_4
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_f32__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    mov r2, #5
+; ARMV4-NEXT:    bl __atomic_store_4
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_f32__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    mov r2, #0
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    str r1, [r0]
+; ARMV6-NEXT:    mcr p15, #0, r2, c7, c10, #5
+; ARMV6-NEXT:    bx lr
+;
+; THUMBM-LABEL: store_atomic_f32__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    str r1, [r0]
+; THUMBM-NEXT:    dmb sy
+; THUMBM-NEXT:    bx lr
+  store atomic float %val1, ptr %ptr seq_cst, align 4
+  ret void
+}
+
+define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) {
+; ARM-LABEL: store_atomic_f64__seq_cst:
+; ARM:       @ %bb.0:
+; ARM-NEXT:    push {r4, r5, lr}
+; ARM-NEXT:    mov r3, r2
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    mov r2, r1
+; ARM-NEXT:  LBB13_1: @ %atomicrmw.start
+; ARM-NEXT:    @ =>This Inner Loop Header: Depth=1
+; ARM-NEXT:    ldrexd r4, r5, [r0]
+; ARM-NEXT:    strexd r1, r2, r3, [r0]
+; ARM-NEXT:    cmp r1, #0
+; ARM-NEXT:    bne LBB13_1
+; ARM-NEXT:  @ %bb.2: @ %atomicrmw.end
+; ARM-NEXT:    dmb ish
+; ARM-NEXT:    pop {r4, r5, pc}
+;
+; ARMOPTNONE-LABEL: store_atomic_f64__seq_cst:
+; ARMOPTNONE:       @ %bb.0:
+; ARMOPTNONE-NEXT:    push {r4, r5, r7, lr}
+; ARMOPTNONE-NEXT:    add r7, sp, #8
+; ARMOPTNONE-NEXT:    push {r8, r10, r11}
+; ARMOPTNONE-NEXT:    sub sp, sp, #20
+; ARMOPTNONE-NEXT:    str r0, [sp] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    vmov d16, r1, r2
+; ARMOPTNONE-NEXT:    vmov r1, r2, d16
+; ARMOPTNONE-NEXT:    str r2, [sp, #4] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    ldr r1, [r0]
+; ARMOPTNONE-NEXT:    ldr r0, [r0, #4]
+; ARMOPTNONE-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    b LBB13_1
+; ARMOPTNONE-NEXT:  LBB13_1: @ %atomicrmw.start
+; ARMOPTNONE-NEXT:    @ =>This Loop Header: Depth=1
+; ARMOPTNONE-NEXT:    @ Child Loop BB13_2 Depth 2
+; ARMOPTNONE-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    ldr r3, [sp] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    ldr r10, [sp, #8] @ 4-byte Reload
+; ARMOPTNONE-NEXT:    @ kill: def $r10 killed $r10 def $r10_r11
+; ARMOPTNONE-NEXT:    mov r11, r0
+; ARMOPTNONE-NEXT:    mov r8, r2
+; ARMOPTNONE-NEXT:    mov r9, r1
+; ARMOPTNONE-NEXT:  LBB13_2: @ %atomicrmw.start
+; ARMOPTNONE-NEXT:    @ Parent Loop BB13_1 Depth=1
+; ARMOPTNONE-NEXT:    @ => This Inner Loop Header: Depth=2
+; ARMOPTNONE-NEXT:    ldrexd r4, r5, [r3]
+; ARMOPTNONE-NEXT:    cmp r4, r8
+; ARMOPTNONE-NEXT:    cmpeq r5, r9
+; ARMOPTNONE-NEXT:    bne LBB13_4
+; ARMOPTNONE-NEXT:  @ %bb.3: @ %atomicrmw.start
+; ARMOPTNONE-NEXT:    @ in Loop: Header=BB13_2 Depth=2
+; ARMOPTNONE-NEXT:    strexd r0, r10, r11, [r3]
+; ARMOPTNONE-NEXT:    cmp r0, #0
+; ARMOPTNONE-NEXT:    bne LBB13_2
+; ARMOPTNONE-NEXT:  LBB13_4: @ %atomicrmw.start
+; ARMOPTNONE-NEXT:    @ in Loop: Header=BB13_1 Depth=1
+; ARMOPTNONE-NEXT:    mov r0, r5
+; ARMOPTNONE-NEXT:    eor r3, r0, r1
+; ARMOPTNONE-NEXT:    mov r1, r4
+; ARMOPTNONE-NEXT:    eor r2, r1, r2
+; ARMOPTNONE-NEXT:    orr r2, r2, r3
+; ARMOPTNONE-NEXT:    cmp r2, #0
+; ARMOPTNONE-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; ARMOPTNONE-NEXT:    bne LBB13_1
+; ARMOPTNONE-NEXT:    b LBB13_5
+; ARMOPTNONE-NEXT:  LBB13_5: @ %atomicrmw.end
+; ARMOPTNONE-NEXT:    dmb ish
+; ARMOPTNONE-NEXT:    sub sp, r7, #20
+; ARMOPTNONE-NEXT:    pop {r8, r10, r11}
+; ARMOPTNONE-NEXT:    pop {r4, r5, r7, pc}
+;
+; THUMBTWO-LABEL: store_atomic_f64__seq_cst:
+; THUMBTWO:       @ %bb.0:
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:  LBB13_1: @ %atomicrmw.start
+; THUMBTWO-NEXT:    @ =>This Inner Loop Header: Depth=1
+; THUMBTWO-NEXT:    ldrexd r3, r9, [r0]
+; THUMBTWO-NEXT:    strexd r3, r1, r2, [r0]
+; THUMBTWO-NEXT:    cmp r3, #0
+; THUMBTWO-NEXT:    bne LBB13_1
+; THUMBTWO-NEXT:  @ %bb.2: @ %atomicrmw.end
+; THUMBTWO-NEXT:    dmb ish
+; THUMBTWO-NEXT:    bx lr
+;
+; THUMBONE-LABEL: store_atomic_f64__seq_cst:
+; THUMBONE:       @ %bb.0:
+; THUMBONE-NEXT:    push {r7, lr}
+; THUMBONE-NEXT:    bl __sync_lock_test_and_set_8
+; THUMBONE-NEXT:    pop {r7, pc}
+;
+; ARMV4-LABEL: store_atomic_f64__seq_cst:
+; ARMV4:       @ %bb.0:
+; ARMV4-NEXT:    push {r11, lr}
+; ARMV4-NEXT:    sub sp, sp, #8
+; ARMV4-NEXT:    mov r1, #5
+; ARMV4-NEXT:    str r1, [sp]
+; ARMV4-NEXT:    bl __atomic_store_8
+; ARMV4-NEXT:    add sp, sp, #8
+; ARMV4-NEXT:    pop {r11, lr}
+; ARMV4-NEXT:    mov pc, lr
+;
+; ARMV6-LABEL: store_atomic_f64__seq_cst:
+; ARMV6:       @ %bb.0:
+; ARMV6-NEXT:    push {r4, r5, r11, lr}
+; ARMV6-NEXT:    @ kill: def $r3 killed $r3 killed $r2_r3 def $r2_r3
+; ARMV6-NEXT:    mov r1, #0
+; ARMV6-NEXT:    @ kill: def $r2 killed $r2 killed $r2_r3 def $r2_r3
+; ARMV6-NEXT:    mcr p15, #0, r1, c7, c10, #5
+; ARMV6-NEXT:  .LBB13_1: @ %atomicrmw.start
+; ARMV6-NEXT:    @ =>This Inner Loop Header: Depth=1
+; ARMV6-NEXT:    ldrexd r4, r5, [r0]
+; ARMV6-NEXT:    strexd r1, r2, r3, [r0]
+; ARMV6-NEXT:    cmp r1, #0
+; ARMV6-NEXT:    bne .LBB13_1
+; ARMV6-NEXT:  @ %bb.2: @ %atomicrmw.end
+; ARMV6-NEXT:    mov r0, #0
+; ARMV6-NEXT:    mcr p15, #0, r0, c7, c10, #5
+; ARMV6-NEXT:    pop {r4, r5, r11, pc}
+;
+; THUMBM-LABEL: store_atomic_f64__seq_cst:
+; THUMBM:       @ %bb.0:
+; THUMBM-NEXT:    push {r7, lr}
+; THUMBM-NEXT:    sub sp, #8
+; THUMBM-NEXT:    movs r1, #5
+; THUMBM-NEXT:    str r1, [sp]
+; THUMBM-NEXT:    bl __atomic_store_8
+; THUMBM-NEXT:    add sp, #8
+; THUMBM-NEXT:    pop {r7, pc}
+  store atomic double %val1, ptr %ptr seq_cst, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/AVR/shift.ll b/llvm/test/CodeGen/AVR/shift.ll
index c0abc77c9b14a..55ea509a8a5b6 100644
--- a/llvm/test/CodeGen/AVR/shift.ll
+++ b/llvm/test/CodeGen/AVR/shift.ll
@@ -60,13 +60,13 @@ define i64 @shift_i64_i64(i64 %a, i64 %b) {
 ; CHECK-NEXT:    breq .LBB3_3
 ; CHECK-NEXT:  ; %bb.1: ; %shift.loop.preheader
 ; CHECK-NEXT:    mov r27, r1
-; CHECK-NEXT:    mov r16, r1
-; CHECK-NEXT:    mov r17, r1
+; CHECK-NEXT:    mov r16, r27
+; CHECK-NEXT:    mov r17, r27
 ; CHECK-NEXT:  .LBB3_2: ; %shift.loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    mov r31, r21
 ; CHECK-NEXT:    lsl r31
-; CHECK-NEXT:    mov r26, r1
+; CHECK-NEXT:    mov r26, r27
 ; CHECK-NEXT:    rol r26
 ; CHECK-NEXT:    lsl r22
 ; CHECK-NEXT:    rol r23
diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll
index 04cdbe9d7e785..ff5bec53acd25 100644
--- a/llvm/test/CodeGen/PowerPC/atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics.ll
@@ -462,3 +462,212 @@ define i64 @and_i64_release(ptr %mem, i64 %operand) {
   %val = atomicrmw and ptr %mem, i64 %operand release
   ret i64 %val
 }
+
+define half @load_atomic_f16__seq_cst(ptr %ptr) {
+; PPC32-LABEL: load_atomic_f16__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    stw r0, 20(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    sync
+; PPC32-NEXT:    lhz r3, 0(r3)
+; PPC32-NEXT:    cmpw cr7, r3, r3
+; PPC32-NEXT:    bne- cr7, .+4
+; PPC32-NEXT:    isync
+; PPC32-NEXT:    bl __gnu_h2f_ieee
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_atomic_f16__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    mflr r0
+; PPC64-NEXT:    stdu r1, -112(r1)
+; PPC64-NEXT:    std r0, 128(r1)
+; PPC64-NEXT:    .cfi_def_cfa_offset 112
+; PPC64-NEXT:    .cfi_offset lr, 16
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    lhz r3, 0(r3)
+; PPC64-NEXT:    cmpd cr7, r3, r3
+; PPC64-NEXT:    bne- cr7, .+4
+; PPC64-NEXT:    isync
+; PPC64-NEXT:    bl __gnu_h2f_ieee
+; PPC64-NEXT:    nop
+; PPC64-NEXT:    addi r1, r1, 112
+; PPC64-NEXT:    ld r0, 16(r1)
+; PPC64-NEXT:    mtlr r0
+; PPC64-NEXT:    blr
+  %val = load atomic half, ptr %ptr seq_cst, align 2
+  ret half %val
+}
+
+; FIXME: bf16_to_fp fails to select
+; define bfloat @load_atomic_bf16__seq_cst(ptr %ptr) {
+;   %val = load atomic bfloat, ptr %ptr seq_cst, align 2
+;   ret bfloat %val
+; }
+
+define float @load_atomic_f32__seq_cst(ptr %ptr) {
+; PPC32-LABEL: load_atomic_f32__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    sync
+; PPC32-NEXT:    lwz r3, 0(r3)
+; PPC32-NEXT:    cmpw cr7, r3, r3
+; PPC32-NEXT:    bne- cr7, .+4
+; PPC32-NEXT:    isync
+; PPC32-NEXT:    stw r3, 12(r1)
+; PPC32-NEXT:    lfs f1, 12(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_atomic_f32__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    lwz r3, 0(r3)
+; PPC64-NEXT:    cmpd cr7, r3, r3
+; PPC64-NEXT:    bne- cr7, .+4
+; PPC64-NEXT:    isync
+; PPC64-NEXT:    stw r3, -4(r1)
+; PPC64-NEXT:    lfs f1, -4(r1)
+; PPC64-NEXT:    blr
+  %val = load atomic float, ptr %ptr seq_cst, align 4
+  ret float %val
+}
+
+define double @load_atomic_f64__seq_cst(ptr %ptr) {
+; PPC32-LABEL: load_atomic_f64__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    stw r0, 20(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    li r4, 5
+; PPC32-NEXT:    bl __atomic_load_8
+; PPC32-NEXT:    stw r4, 12(r1)
+; PPC32-NEXT:    stw r3, 8(r1)
+; PPC32-NEXT:    lfd f1, 8(r1)
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_atomic_f64__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    ld r3, 0(r3)
+; PPC64-NEXT:    cmpd cr7, r3, r3
+; PPC64-NEXT:    bne- cr7, .+4
+; PPC64-NEXT:    isync
+; PPC64-NEXT:    std r3, -8(r1)
+; PPC64-NEXT:    lfd f1, -8(r1)
+; PPC64-NEXT:    blr
+  %val = load atomic double, ptr %ptr seq_cst, align 8
+  ret double %val
+}
+
+define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) {
+; PPC32-LABEL: store_atomic_f16__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    stw r0, 20(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    .cfi_offset r30, -8
+; PPC32-NEXT:    stw r30, 8(r1) # 4-byte Folded Spill
+; PPC32-NEXT:    mr r30, r3
+; PPC32-NEXT:    bl __gnu_f2h_ieee
+; PPC32-NEXT:    sync
+; PPC32-NEXT:    sth r3, 0(r30)
+; PPC32-NEXT:    lwz r30, 8(r1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: store_atomic_f16__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    mflr r0
+; PPC64-NEXT:    stdu r1, -128(r1)
+; PPC64-NEXT:    std r0, 144(r1)
+; PPC64-NEXT:    .cfi_def_cfa_offset 128
+; PPC64-NEXT:    .cfi_offset lr, 16
+; PPC64-NEXT:    .cfi_offset r30, -16
+; PPC64-NEXT:    std r30, 112(r1) # 8-byte Folded Spill
+; PPC64-NEXT:    mr r30, r3
+; PPC64-NEXT:    bl __gnu_f2h_ieee
+; PPC64-NEXT:    nop
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    sth r3, 0(r30)
+; PPC64-NEXT:    ld r30, 112(r1) # 8-byte Folded Reload
+; PPC64-NEXT:    addi r1, r1, 128
+; PPC64-NEXT:    ld r0, 16(r1)
+; PPC64-NEXT:    mtlr r0
+; PPC64-NEXT:    blr
+  store atomic half %val1, ptr %ptr seq_cst, align 2
+  ret void
+}
+
+; FIXME: bf16_to_fp fails to select
+; define void @store_atomic_bf16__seq_cst(ptr %ptr, bfloat %val1) {
+;   store atomic bfloat %val1, ptr %ptr seq_cst, align 2
+;   ret void
+; }
+
+define void @store_atomic_f32__seq_cst(ptr %ptr, float %val1) {
+; PPC32-LABEL: store_atomic_f32__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    stfs f1, 12(r1)
+; PPC32-NEXT:    lwz r4, 12(r1)
+; PPC32-NEXT:    sync
+; PPC32-NEXT:    stw r4, 0(r3)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: store_atomic_f32__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    stfs f1, -4(r1)
+; PPC64-NEXT:    lwz r4, -4(r1)
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    stw r4, 0(r3)
+; PPC64-NEXT:    blr
+  store atomic float %val1, ptr %ptr seq_cst, align 4
+  ret void
+}
+
+define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) {
+; PPC32-LABEL: store_atomic_f64__seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    stw r0, 20(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    stfd f1, 8(r1)
+; PPC32-NEXT:    li r7, 5
+; PPC32-NEXT:    lwz r5, 8(r1)
+; PPC32-NEXT:    lwz r6, 12(r1)
+; PPC32-NEXT:    bl __atomic_store_8
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: store_atomic_f64__seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    stfd f1, -8(r1)
+; PPC64-NEXT:    ld r4, -8(r1)
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    std r4, 0(r3)
+; PPC64-NEXT:    blr
+  store atomic double %val1, ptr %ptr seq_cst, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir b/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir
index d8f2b08adaf2f..dc20a1577aa5b 100644
--- a/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir
+++ b/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir
@@ -3,16 +3,16 @@
 # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
 # RUN:   -run-pass ppc-mi-peepholes %s -o - | FileCheck %s --check-prefix=ALL
 # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
-# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=0,ppc-xtoi-peephole-count=8 \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=0-7 \
 # RUN:   | FileCheck %s --check-prefix=ALL
 # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
-# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=3,ppc-xtoi-peephole-count=2 \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=3-4 \
 # RUN:   | FileCheck %s --check-prefix=ONE-FIRSTSTORE
 # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
-# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=5,ppc-xtoi-peephole-count=2 \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=5-6 \
 # RUN:   | FileCheck %s --check-prefix=ONE-SECONDSTORE
 # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
-# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=3,ppc-xtoi-peephole-count=4 \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole=3-6 \
 # RUN:   | FileCheck %s --check-prefix=TWO
 
 ---
diff --git a/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir b/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir
index cf3ff291e26c6..09f7ededa20c6 100644
--- a/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir
+++ b/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir
@@ -3,16 +3,19 @@
 # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
 # RUN:   -run-pass ppc-mi-peepholes %s -o - | FileCheck %s --check-prefix=ALL
 # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
-# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=0,ppc-per-op-peephole-count=6 \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=0-5 \
 # RUN:   | FileCheck %s --check-prefix=ALL
 # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
-# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=3,ppc-per-op-peephole-count=1 \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=0-5 \
+# RUN:   | FileCheck %s --check-prefix=ALL
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=3 \
 # RUN:   | FileCheck %s --check-prefix=ONE-FIRST-RLWINM
 # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
-# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=4,ppc-per-op-peephole-count=1 \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=4 \
 # RUN:   | FileCheck %s --check-prefix=ONE-SECOND-RLWINM
 # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
-# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=3,ppc-per-op-peephole-count=2 \
+# RUN:   -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole=3-4 \
 # RUN:   | FileCheck %s --check-prefix=TWO
 
 ---
diff --git a/llvm/test/CodeGen/PowerPC/pr62372.ll b/llvm/test/CodeGen/PowerPC/pr62372.ll
new file mode 100644
index 0000000000000..8df236adc92d7
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr62372.ll
@@ -0,0 +1,13 @@
+; RUN: llc -ppc-asm-full-reg-names -mcpu=pwr10 -mtriple powerpc64le-unknown-linux-gnu \
+; RUN: -o - %s | FileCheck %s
+
+@bar = dso_local global i32 0, align 4
+
+define dso_local ptr @foo() #0 {
+entry:
+  ret ptr @bar
+}
+
+attributes #0 = { "use-soft-float"="true" }
+
+; CHECK: paddi r3, 0, bar@PCREL, 1
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
new file mode 100644
index 0000000000000..5c42fefb95b39
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -global-isel-abort=1 < %s 2>&1 | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64 -global-isel -global-isel-abort=1 < %s 2>&1 | FileCheck %s --check-prefixes=RV64
+
+define i2 @bitreverse_i2(i2 %x) {
+; RV32-LABEL: bitreverse_i2:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a0, 1
+; RV32-NEXT:    andi a1, a1, 2
+; RV32-NEXT:    andi a0, a0, 3
+; RV32-NEXT:    srli a0, a0, 1
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bitreverse_i2:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 1
+; RV64-NEXT:    andi a1, a1, 2
+; RV64-NEXT:    andi a0, a0, 3
+; RV64-NEXT:    srliw a0, a0, 1
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %rev = call i2 @llvm.bitreverse.i2(i2 %x)
+  ret i2 %rev
+}
+
+define i3 @bitreverse_i3(i3 %x) {
+; RV32-LABEL: bitreverse_i3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a0, 2
+; RV32-NEXT:    andi a1, a1, 4
+; RV32-NEXT:    andi a0, a0, 7
+; RV32-NEXT:    andi a2, a0, 2
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    srli a0, a0, 2
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bitreverse_i3:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 2
+; RV64-NEXT:    andi a1, a1, 4
+; RV64-NEXT:    andi a0, a0, 7
+; RV64-NEXT:    andi a2, a0, 2
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    srliw a0, a0, 2
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %rev = call i3 @llvm.bitreverse.i3(i3 %x)
+  ret i3 %rev
+}
+
+define i4 @bitreverse_i4(i4 %x) {
+; RV32-LABEL: bitreverse_i4:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a0, 3
+; RV32-NEXT:    andi a1, a1, 8
+; RV32-NEXT:    slli a2, a0, 1
+; RV32-NEXT:    andi a2, a2, 4
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    andi a0, a0, 15
+; RV32-NEXT:    srli a2, a0, 1
+; RV32-NEXT:    andi a2, a2, 2
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    srli a0, a0, 3
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bitreverse_i4:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 3
+; RV64-NEXT:    andi a1, a1, 8
+; RV64-NEXT:    slli a2, a0, 1
+; RV64-NEXT:    andi a2, a2, 4
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    andi a0, a0, 15
+; RV64-NEXT:    srliw a2, a0, 1
+; RV64-NEXT:    andi a2, a2, 2
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    srliw a0, a0, 3
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %rev = call i4 @llvm.bitreverse.i4(i4 %x)
+  ret i4 %rev
+}
+
+define i7 @bitreverse_i7(i7 %x) {
+; RV32-LABEL: bitreverse_i7:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a0, 6
+; RV32-NEXT:    andi a1, a1, 64
+; RV32-NEXT:    slli a2, a0, 4
+; RV32-NEXT:    andi a2, a2, 32
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    slli a2, a0, 2
+; RV32-NEXT:    andi a2, a2, 16
+; RV32-NEXT:    andi a0, a0, 127
+; RV32-NEXT:    andi a3, a0, 8
+; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    srli a2, a0, 2
+; RV32-NEXT:    andi a2, a2, 4
+; RV32-NEXT:    srli a3, a0, 4
+; RV32-NEXT:    andi a3, a3, 2
+; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    srli a0, a0, 6
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bitreverse_i7:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 6
+; RV64-NEXT:    andi a1, a1, 64
+; RV64-NEXT:    slli a2, a0, 4
+; RV64-NEXT:    andi a2, a2, 32
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    slli a2, a0, 2
+; RV64-NEXT:    andi a2, a2, 16
+; RV64-NEXT:    andi a0, a0, 127
+; RV64-NEXT:    andi a3, a0, 8
+; RV64-NEXT:    or a2, a2, a3
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    srliw a2, a0, 2
+; RV64-NEXT:    andi a2, a2, 4
+; RV64-NEXT:    srliw a3, a0, 4
+; RV64-NEXT:    andi a3, a3, 2
+; RV64-NEXT:    or a2, a2, a3
+; RV64-NEXT:    or a1, a1, a2
+; RV64-NEXT:    srliw a0, a0, 6
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %rev = call i7 @llvm.bitreverse.i7(i7 %x)
+  ret i7 %rev
+}
+
+define i24 @bitreverse_i24(i24 %x) {
+; RV32-LABEL: bitreverse_i24:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a1, a0, 16
+; RV32-NEXT:    lui a2, 4096
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    srli a0, a0, 16
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    lui a1, 1048335
+; RV32-NEXT:    addi a1, a1, 240
+; RV32-NEXT:    and a3, a1, a2
+; RV32-NEXT:    and a3, a0, a3
+; RV32-NEXT:    srli a3, a3, 4
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    or a0, a3, a0
+; RV32-NEXT:    lui a1, 1047757
+; RV32-NEXT:    addi a1, a1, -820
+; RV32-NEXT:    and a3, a1, a2
+; RV32-NEXT:    and a3, a0, a3
+; RV32-NEXT:    srli a3, a3, 2
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    or a0, a3, a0
+; RV32-NEXT:    lui a1, 1047211
+; RV32-NEXT:    addi a1, a1, -1366
+; RV32-NEXT:    and a2, a1, a2
+; RV32-NEXT:    and a2, a0, a2
+; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: bitreverse_i24:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a1, a0, 16
+; RV64-NEXT:    lui a2, 4096
+; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    srliw a0, a0, 16
+; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    lui a1, 1048335
+; RV64-NEXT:    addi a1, a1, 240
+; RV64-NEXT:    and a3, a1, a2
+; RV64-NEXT:    and a3, a0, a3
+; RV64-NEXT:    srliw a3, a3, 4
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    or a0, a3, a0
+; RV64-NEXT:    lui a1, 1047757
+; RV64-NEXT:    addi a1, a1, -820
+; RV64-NEXT:    and a3, a1, a2
+; RV64-NEXT:    and a3, a0, a3
+; RV64-NEXT:    srliw a3, a3, 2
+; RV64-NEXT:    slli a0, a0, 2
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    or a0, a3, a0
+; RV64-NEXT:    lui a1, 1047211
+; RV64-NEXT:    addiw a1, a1, -1366
+; RV64-NEXT:    and a2, a1, a2
+; RV64-NEXT:    and a2, a0, a2
+; RV64-NEXT:    srliw a2, a2, 1
+; RV64-NEXT:    slliw a0, a0, 1
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    or a0, a2, a0
+; RV64-NEXT:    ret
+  %rev = call i24 @llvm.bitreverse.i24(i24 %x)
+  ret i24 %rev
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir
index 7d05edd3f3413..f96d659782178 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv32.mir
@@ -1,8 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \
-# RUN:   | FileCheck %s --check-prefix=RV32I
+# RUN:   | FileCheck %s --check-prefixes=CHECK,RV32I
 # RUN: llc -mtriple=riscv32 -mattr=+zbb -run-pass=legalizer %s -o -\
-# RUN:   | FileCheck %s --check-prefix=RV32ZBB
+# RUN:   | FileCheck %s --check-prefixes=CHECK,RV32ZBB
 
 ---
 name:            abs_i8
@@ -124,10 +124,12 @@ body:             |
     ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[COPY1]], [[C1]](s32)
     ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[ASHR]]
     ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[ASHR]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
     ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[ASHR1]]
     ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]]
-    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ADD]], [[ASHR]]
-    ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[ADD2]], [[ASHR1]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32)
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[ASHR]]
+    ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[COPY3]], [[ASHR1]]
     ; CHECK-NEXT: $x10 = COPY [[XOR]](s32)
     ; CHECK-NEXT: $x11 = COPY [[XOR1]](s32)
     ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir
index 5044514babe54..7625a5c2d568a 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv32.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=riscv32 -mattr=+v -global-isel-abort=0 -run-pass=legalizer %s -o - | FileCheck %s
 
 ---
 name:            bitreverse_i8
@@ -248,3 +248,277 @@ body:             |
     PseudoRET implicit $x10, implicit $x11
 
 ...
+---
+name:            bitreverse_i2
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C2]](s32)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]]
+    ; CHECK-NEXT: $x10 = COPY [[OR]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s2) = G_TRUNC %1(s32)
+    %2:_(s2) = G_BITREVERSE %0
+    %3:_(s32) = G_ANYEXT %2(s2)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i3
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i3
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C2]](s32)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C6]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C5]](s32)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C7]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND4]]
+    ; CHECK-NEXT: $x10 = COPY [[OR1]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s3) = G_TRUNC %1(s32)
+    %2:_(s3) = G_BITREVERSE %0
+    %3:_(s32) = G_ANYEXT %2(s3)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i4
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i4
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C5]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C6]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND3]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C8]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[C7]](s32)
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C9]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND5]]
+    ; CHECK-NEXT: $x10 = COPY [[OR2]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s4) = G_TRUNC %1(s32)
+    %2:_(s4) = G_BITREVERSE %0
+    %3:_(s32) = G_ANYEXT %2(s4)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i7
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i7
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C2]](s32)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C4]](s32)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND2]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C7]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C6]](s32)
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C8]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND4]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C10]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C9]](s32)
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C11]]
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[AND6]]
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C13]]
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[C12]](s32)
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C14]]
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[AND8]]
+    ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C16]]
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND9]], [[C15]](s32)
+    ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C17]]
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[AND10]]
+    ; CHECK-NEXT: $x10 = COPY [[OR5]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s7) = G_TRUNC %1(s32)
+    %2:_(s7) = G_BITREVERSE %0
+    %3:_(s32) = G_ANYEXT %2(s7)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i24
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i24
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -986896
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[OR]], [[C3]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[AND1]], [[C4]]
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C2]](s32)
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[OR]], [[C2]](s32)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[AND3]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 -3355444
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[OR1]], [[C6]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[AND4]], [[C7]]
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C5]](s32)
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C5]](s32)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C6]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[AND6]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 -5592406
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[OR2]], [[C9]]
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[AND7]], [[C10]]
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND8]], [[C8]](s32)
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[OR2]], [[C8]](s32)
+    ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C9]]
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR3]], [[AND9]]
+    ; CHECK-NEXT: $x10 = COPY [[OR3]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s32) = COPY $x10
+    %0:_(s24) = G_TRUNC %1(s32)
+    %2:_(s24) = G_BITREVERSE %0
+    %3:_(s32) = G_ANYEXT %2(s24)
+    $x10 = COPY %3(s32)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_v2i4
+body:             |
+  bb.1.entry:
+
+    ; CHECK-LABEL: name: bitreverse_v2i4
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $v8
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s4>) = G_TRUNC [[COPY]](<2 x s32>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s4) = G_CONSTANT i4 3
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C]](s4), [[C]](s4)
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR]](<2 x s4>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s4) = G_CONSTANT i4 -8
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C1]](s4), [[C1]](s4)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C2]](s4), [[C2]](s4)
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR2]](<2 x s4>)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s4) = G_CONSTANT i4 4
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C3]](s4), [[C3]](s4)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL1]], [[BUILD_VECTOR3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s4>) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C4]](s4), [[C4]](s4)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR4]](<2 x s4>)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s4) = G_CONSTANT i4 2
+    ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C5]](s4), [[C5]](s4)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR]], [[BUILD_VECTOR5]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR]], [[AND2]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s4) = G_CONSTANT i4 3
+    ; CHECK-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C6]](s4), [[C6]](s4)
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR6]](<2 x s4>)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C7]](s4), [[C7]](s4)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR1]], [[BUILD_VECTOR7]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR1]], [[AND3]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s4>) = COPY [[OR2]](<2 x s4>)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY1]](<2 x s4>)
+    ; CHECK-NEXT: $v8 = COPY [[ANYEXT]](<2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<2 x s32>) = COPY $v8
+    %0:_(<2 x s4>) = G_TRUNC %1(<2 x s32>)
+    %2:_(<2 x s4>) = G_BITREVERSE %0
+    %3:_(<2 x s32>) = G_ANYEXT %2(<2 x s4>)
+    $v8 = COPY %3(<2 x s32>)
+    PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir
index d147350465166..71583f15cd5cd 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-bitreverse-rv64.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -global-isel-abort=0 -run-pass=legalizer %s -o - | FileCheck %s
 
 ---
 name:            bitreverse_i8
@@ -251,3 +251,304 @@ body:             |
     PseudoRET implicit $x10
 
 ...
+---
+name:            bitreverse_i2
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i2
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C2]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s2) = G_TRUNC %1(s64)
+    %2:_(s2) = G_BITREVERSE %0
+    %3:_(s64) = G_ANYEXT %2(s2)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i3
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i3
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C2]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C3]](s64)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C4]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND2]]
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC2]], [[C5]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C6]](s64)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C7]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND4]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s3) = G_TRUNC %1(s64)
+    %2:_(s3) = G_BITREVERSE %0
+    %3:_(s64) = G_ANYEXT %2(s3)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i4
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i4
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C2]](s64)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[TRUNC2]], [[C4]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C5]](s64)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C6]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND3]]
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[TRUNC3]], [[C7]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[C8]](s64)
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C9]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND5]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR2]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s4) = G_TRUNC %1(s64)
+    %2:_(s4) = G_BITREVERSE %0
+    %3:_(s64) = G_ANYEXT %2(s4)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i7
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i7
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C2]](s64)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[TRUNC2]], [[C4]](s64)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C5]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[AND2]]
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC3]], [[C6]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C7]](s64)
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C8]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[AND4]]
+    ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[TRUNC4]], [[C9]]
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C10]](s64)
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C11]]
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[AND6]]
+    ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[TRUNC5]], [[C12]]
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[C13]](s64)
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C14]]
+    ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[AND8]]
+    ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[TRUNC6]], [[C15]]
+    ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND9]], [[C16]](s64)
+    ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[LSHR3]], [[C17]]
+    ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[AND10]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s7) = G_TRUNC %1(s64)
+    %2:_(s7) = G_BITREVERSE %0
+    %3:_(s64) = G_ANYEXT %2(s7)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_i24
+body:             |
+  bb.1.entry:
+    liveins: $x10
+
+    ; CHECK-LABEL: name: bitreverse_i24
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s64)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C2]](s64)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[LSHR]], [[SHL]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -986896
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[OR]], [[C3]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[AND1]], [[C4]]
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C5]](s64)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[OR]], [[C6]](s64)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SHL1]], [[C3]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[LSHR1]], [[AND3]]
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 -3355444
+    ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[OR1]], [[C7]]
+    ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[AND4]], [[C8]]
+    ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C9]](s64)
+    ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C10]](s64)
+    ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[SHL2]], [[C7]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[AND6]]
+    ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 -5592406
+    ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[OR2]], [[C11]]
+    ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215
+    ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[AND7]], [[C12]]
+    ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND8]], [[C13]](s64)
+    ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[OR2]], [[C14]](s64)
+    ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[SHL3]], [[C11]]
+    ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[LSHR3]], [[AND9]]
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR3]](s32)
+    ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %1:_(s64) = COPY $x10
+    %0:_(s24) = G_TRUNC %1(s64)
+    %2:_(s24) = G_BITREVERSE %0
+    %3:_(s64) = G_ANYEXT %2(s24)
+    $x10 = COPY %3(s64)
+    PseudoRET implicit $x10
+
+...
+---
+name:            bitreverse_v2i4
+body:             |
+  bb.1.entry:
+
+    ; CHECK-LABEL: name: bitreverse_v2i4
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $v8
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s4>) = G_TRUNC [[COPY]](<2 x s32>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s4) = G_CONSTANT i4 3
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C]](s4), [[C]](s4)
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR]](<2 x s4>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s4) = G_CONSTANT i4 -8
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C1]](s4), [[C1]](s4)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C2]](s4), [[C2]](s4)
+    ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s4>) = G_SHL [[TRUNC]], [[BUILD_VECTOR2]](<2 x s4>)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s4) = G_CONSTANT i4 4
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C3]](s4), [[C3]](s4)
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<2 x s4>) = G_AND [[SHL1]], [[BUILD_VECTOR3]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s4>) = G_OR [[AND]], [[AND1]]
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C4]](s4), [[C4]](s4)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR4]](<2 x s4>)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s4) = G_CONSTANT i4 2
+    ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C5]](s4), [[C5]](s4)
+    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR]], [[BUILD_VECTOR5]]
+    ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR]], [[AND2]]
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s4) = G_CONSTANT i4 3
+    ; CHECK-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C6]](s4), [[C6]](s4)
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s4>) = G_LSHR [[TRUNC]], [[BUILD_VECTOR6]](<2 x s4>)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s4) = G_CONSTANT i4 1
+    ; CHECK-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s4>) = G_BUILD_VECTOR [[C7]](s4), [[C7]](s4)
+    ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<2 x s4>) = G_AND [[LSHR1]], [[BUILD_VECTOR7]]
+    ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(<2 x s4>) = G_OR [[OR1]], [[AND3]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s4>) = COPY [[OR2]](<2 x s4>)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<2 x s32>) = G_ANYEXT [[COPY1]](<2 x s4>)
+    ; CHECK-NEXT: $v8 = COPY [[ANYEXT]](<2 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $v8
+    %1:_(<2 x s32>) = COPY $v8
+    %0:_(<2 x s4>) = G_TRUNC %1(<2 x s32>)
+    %2:_(<2 x s4>) = G_BITREVERSE %0
+    %3:_(<2 x s32>) = G_ANYEXT %2(<2 x s4>)
+    $v8 = COPY %3(<2 x s32>)
+    PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 549d531e829ea..a90c244437a03 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -383,8 +383,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32I-NEXT:    addi s4, a0, %lo(.LCPI3_0)
+; RV32I-NEXT:    lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI3_0)
 ; RV32I-NEXT:    neg a0, s2
 ; RV32I-NEXT:    and a0, s2, a0
 ; RV32I-NEXT:    mv a1, s3
@@ -442,9 +442,9 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV32M-LABEL: test_cttz_i64:
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    lui a2, 30667
-; RV32M-NEXT:    addi a2, a2, 1329
-; RV32M-NEXT:    lui a3, %hi(.LCPI3_0)
-; RV32M-NEXT:    addi a3, a3, %lo(.LCPI3_0)
+; RV32M-NEXT:    addi a3, a2, 1329
+; RV32M-NEXT:    lui a2, %hi(.LCPI3_0)
+; RV32M-NEXT:    addi a2, a2, %lo(.LCPI3_0)
 ; RV32M-NEXT:    bnez a1, .LBB3_3
 ; RV32M-NEXT:  # %bb.1:
 ; RV32M-NEXT:    li a1, 32
@@ -452,18 +452,18 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV32M-NEXT:  .LBB3_2:
 ; RV32M-NEXT:    neg a1, a0
 ; RV32M-NEXT:    and a0, a0, a1
-; RV32M-NEXT:    mul a0, a0, a2
+; RV32M-NEXT:    mul a0, a0, a3
 ; RV32M-NEXT:    srli a0, a0, 27
-; RV32M-NEXT:    add a0, a3, a0
+; RV32M-NEXT:    add a0, a2, a0
 ; RV32M-NEXT:    lbu a0, 0(a0)
 ; RV32M-NEXT:    li a1, 0
 ; RV32M-NEXT:    ret
 ; RV32M-NEXT:  .LBB3_3:
 ; RV32M-NEXT:    neg a4, a1
 ; RV32M-NEXT:    and a1, a1, a4
-; RV32M-NEXT:    mul a1, a1, a2
+; RV32M-NEXT:    mul a1, a1, a3
 ; RV32M-NEXT:    srli a1, a1, 27
-; RV32M-NEXT:    add a1, a3, a1
+; RV32M-NEXT:    add a1, a2, a1
 ; RV32M-NEXT:    lbu a1, 0(a1)
 ; RV32M-NEXT:    bnez a0, .LBB3_2
 ; RV32M-NEXT:  .LBB3_4:
@@ -814,8 +814,8 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI7_0)
-; RV32I-NEXT:    addi s4, a0, %lo(.LCPI7_0)
+; RV32I-NEXT:    lui s4, %hi(.LCPI7_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI7_0)
 ; RV32I-NEXT:    neg a0, s1
 ; RV32I-NEXT:    and a0, s1, a0
 ; RV32I-NEXT:    mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index 9ae30e646fdbf..fe6e20d852d59 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -48,8 +48,8 @@ define signext i32 @ctz_dereferencing_pointer(ptr %b) nounwind {
 ; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI0_0)
-; RV32I-NEXT:    addi s3, a0, %lo(.LCPI0_0)
+; RV32I-NEXT:    lui s3, %hi(.LCPI0_0)
+; RV32I-NEXT:    addi s3, s3, %lo(.LCPI0_0)
 ; RV32I-NEXT:    neg a0, s4
 ; RV32I-NEXT:    and a0, s4, a0
 ; RV32I-NEXT:    mv a1, s1
@@ -511,8 +511,8 @@ define signext i32 @ctz4(i64 %b) nounwind {
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI6_0)
-; RV32I-NEXT:    addi s4, a0, %lo(.LCPI6_0)
+; RV32I-NEXT:    lui s4, %hi(.LCPI6_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI6_0)
 ; RV32I-NEXT:    neg a0, s2
 ; RV32I-NEXT:    and a0, s2, a0
 ; RV32I-NEXT:    mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
index eb6ac985287a1..478d2eae9dca2 100644
--- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
+++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
@@ -24,31 +24,31 @@ define void @_Z3foov() {
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_49)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_49)
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
-; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_48)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_48)
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs1r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_46)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_46)
-; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vle16.v v10, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_45)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_45)
-; CHECK-NEXT:    vle16.v v14, (a0)
+; CHECK-NEXT:    vle16.v v12, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v12, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v14, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vs2r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_40)
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index 3c2e84689c979..62b1549a5d58a 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -389,8 +389,8 @@ define dso_local i32 @load_ga() local_unnamed_addr #0 {
 define dso_local i64 @load_ga_8() nounwind {
 ; RV32I-LABEL: load_ga_8:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lui a0, %hi(ga_8)
-; RV32I-NEXT:    addi a1, a0, %lo(ga_8)
+; RV32I-NEXT:    lui a1, %hi(ga_8)
+; RV32I-NEXT:    addi a1, a1, %lo(ga_8)
 ; RV32I-NEXT:    lw a0, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
 ; RV32I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
new file mode 100644
index 0000000000000..e30bdfb939471
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)
+
+define i32 @test(ptr %a, i64 %n)  {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:  .LBB0_1: # %loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vl1re32.v v9, (a0)
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    vredsum.vs v9, v9, v8
+; CHECK-NEXT:    vmv.x.s a3, v9
+; CHECK-NEXT:    addw a3, a3, a3
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    addi a0, a0, 8
+; CHECK-NEXT:    bnez a1, .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.inc, %loop ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.inc, %loop ]
+  %idx = getelementptr inbounds ptr, ptr %a, i64 %indvar
+  %data = load <vscale x 2 x i32>, ptr %idx
+  %reduce = tail call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> %data)
+  %sum.inc = add i32 %reduce, %reduce
+  %indvar.inc = add i64 %indvar, 1
+  %cmp = icmp eq i64 %indvar.inc, %n
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret i32 %sum
+}
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
index b45ab135fa1c7..197366e7e05fe 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
@@ -209,8 +209,8 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32I-NEXT:    addi s4, a0, %lo(.LCPI3_0)
+; RV32I-NEXT:    lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI3_0)
 ; RV32I-NEXT:    neg a0, s2
 ; RV32I-NEXT:    and a0, s2, a0
 ; RV32I-NEXT:    mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index 7e6c3f9c87d27..f25aa0de89da8 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -199,8 +199,8 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    mv a1, s3
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32I-NEXT:    addi s4, a0, %lo(.LCPI3_0)
+; RV32I-NEXT:    lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI3_0)
 ; RV32I-NEXT:    neg a0, s2
 ; RV32I-NEXT:    and a0, s2, a0
 ; RV32I-NEXT:    mv a1, s3
diff --git a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
index 42d6dac5b07fa..5ced89c17c420 100644
--- a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
@@ -15,27 +15,30 @@ define void @foo(<vscale x 8 x i8> %0) {
 ; CHECK-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s2, 0(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
 ; CHECK-NEXT:    .cfi_offset s0, -16
 ; CHECK-NEXT:    .cfi_offset s1, -24
+; CHECK-NEXT:    .cfi_offset s2, -32
+; CHECK-NEXT:    li s0, 0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 0, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v9, v10, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.x.s s0, v9
+; CHECK-NEXT:    vmv.x.s s1, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.x.s s1, v8
+; CHECK-NEXT:    vmv.x.s s2, v8
 ; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    li a1, 0
-; CHECK-NEXT:    mv a0, s0
-; CHECK-NEXT:    mv a2, s1
-; CHECK-NEXT:    li a3, 0
-; CHECK-NEXT:    li a4, 0
-; CHECK-NEXT:    li a5, 0
-; CHECK-NEXT:    jalr a1
+; CHECK-NEXT:    mv a0, s1
+; CHECK-NEXT:    mv a1, s0
+; CHECK-NEXT:    mv a2, s2
+; CHECK-NEXT:    mv a3, s0
+; CHECK-NEXT:    mv a4, s0
+; CHECK-NEXT:    mv a5, s0
+; CHECK-NEXT:    jalr s0
 ; CHECK-NEXT:    j .LBB0_1
   %2 = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v16i8(<vscale x 8 x i8> undef, <16 x i8> undef, i64 0)
   %3 = tail call <vscale x 8 x i8> @llvm.vector.insert.nxv8i8.v16i8(<vscale x 8 x i8> undef, <16 x i8> poison, i64 0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 9cb3991f31f94..08b310213d16e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -126,28 +126,28 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    vsaddu.vx v8, v8, a1
-; CHECK-NEXT:    vmsltu.vx v0, v8, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_0)
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v16, (a0)
+; CHECK-NEXT:    vmsltu.vx v0, v8, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_1)
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v24, v16
+; CHECK-NEXT:    vsaddu.vx v16, v24, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
 ; CHECK-NEXT:    vsext.vf8 v16, v8
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_2)
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vmsltu.vx v10, v16, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v8, 2
+; CHECK-NEXT:    vslideup.vi v0, v9, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v10, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsext.vf8 v16, v8
 ; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
@@ -169,13 +169,13 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vsext.vf8 v16, v8
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v10, v16, a2
+; CHECK-NEXT:    vmsltu.vx v8, v16, a2
 ; CHECK-NEXT:    vsext.vf8 v16, v9
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vmsltu.vx v10, v16, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT:    vle8.v v11, (a0)
@@ -187,10 +187,10 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vmsltu.vx v11, v16, a2
 ; CHECK-NEXT:    vid.v v16
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
 ; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    vmsltu.vx v0, v16, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
 ; CHECK-NEXT:    vle8.v v13, (a0)
@@ -201,27 +201,27 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v13, v16, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vslideup.vi v10, v9, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 6
+; CHECK-NEXT:    vslideup.vi v10, v11, 6
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v12, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v13, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsext.vf8 v16, v8
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vmsltu.vx v8, v16, a2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 6
+; CHECK-NEXT:    vslideup.vi v0, v8, 6
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v8, 8
+; CHECK-NEXT:    vslideup.vi v0, v10, 8
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index fff280c005b54..df413b878172b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2574,9 +2574,8 @@ define <vscale x 1 x i9> @vp_ctlz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 ; CHECK-LABEL: vp_ctlz_nxv1i9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
@@ -2593,9 +2592,8 @@ define <vscale x 1 x i9> @vp_ctlz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 ; CHECK-ZVBB-LABEL: vp_ctlz_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    li a1, 511
-; CHECK-ZVBB-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    li a0, 7
 ; CHECK-ZVBB-NEXT:    vsub.vx v8, v8, a0, v0.t
@@ -2607,9 +2605,8 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
@@ -2624,9 +2621,8 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
 ; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    li a1, 511
-; CHECK-ZVBB-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    li a0, 7
 ; CHECK-ZVBB-NEXT:    vsub.vx v8, v8, a0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index e3c53212e91b7..b5cafe410ae8d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -2549,9 +2549,8 @@ define <vscale x 1 x i9> @vp_ctpop_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i
 ; CHECK-LABEL: vp_ctpop_nxv1i9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
@@ -2576,9 +2575,8 @@ define <vscale x 1 x i9> @vp_ctpop_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv1i9:
 ; CHECK-ZVBB:       # %bb.0:
 ; CHECK-ZVBB-NEXT:    li a1, 511
-; CHECK-ZVBB-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-ZVBB-NEXT:    vcpop.v v8, v8, v0.t
 ; CHECK-ZVBB-NEXT:    ret
   %v = call <vscale x 1 x i9> @llvm.vp.ctpop.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
index 75747a6674b7b..d8781495abd75 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
@@ -7,7 +7,7 @@
 define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
@@ -15,6 +15,7 @@ define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -29,7 +30,7 @@ declare <vscale x 1 x half> @llvm.experimental.constrained.ceil.nxv1f16(<vscale
 define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
@@ -37,6 +38,7 @@ define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -51,7 +53,7 @@ declare <vscale x 2 x half> @llvm.experimental.constrained.ceil.nxv2f16(<vscale
 define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
@@ -59,6 +61,7 @@ define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -73,7 +76,7 @@ declare <vscale x 4 x half> @llvm.experimental.constrained.ceil.nxv4f16(<vscale
 define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
@@ -81,6 +84,7 @@ define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -95,7 +99,7 @@ declare <vscale x 8 x half> @llvm.experimental.constrained.ceil.nxv8f16(<vscale
 define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
@@ -103,6 +107,7 @@ define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -117,7 +122,7 @@ declare <vscale x 16 x half> @llvm.experimental.constrained.ceil.nxv16f16(<vscal
 define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv32f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
@@ -125,6 +130,7 @@ define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -139,7 +145,7 @@ declare <vscale x 32 x half> @llvm.experimental.constrained.ceil.nxv32f16(<vscal
 define <vscale x 1 x float> @ceil_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -147,6 +153,7 @@ define <vscale x 1 x float> @ceil_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -161,7 +168,7 @@ declare <vscale x 1 x float> @llvm.experimental.constrained.ceil.nxv1f32(<vscale
 define <vscale x 2 x float> @ceil_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -169,6 +176,7 @@ define <vscale x 2 x float> @ceil_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -183,7 +191,7 @@ declare <vscale x 2 x float> @llvm.experimental.constrained.ceil.nxv2f32(<vscale
 define <vscale x 4 x float> @ceil_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
@@ -191,6 +199,7 @@ define <vscale x 4 x float> @ceil_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -205,7 +214,7 @@ declare <vscale x 4 x float> @llvm.experimental.constrained.ceil.nxv4f32(<vscale
 define <vscale x 8 x float> @ceil_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
@@ -213,6 +222,7 @@ define <vscale x 8 x float> @ceil_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -227,7 +237,7 @@ declare <vscale x 8 x float> @llvm.experimental.constrained.ceil.nxv8f32(<vscale
 define <vscale x 16 x float> @ceil_nxv16f32(<vscale x 16 x float> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
@@ -235,6 +245,7 @@ define <vscale x 16 x float> @ceil_nxv16f32(<vscale x 16 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -249,7 +260,7 @@ declare <vscale x 16 x float> @llvm.experimental.constrained.ceil.nxv16f32(<vsca
 define <vscale x 1 x double> @ceil_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
@@ -257,6 +268,7 @@ define <vscale x 1 x double> @ceil_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -271,7 +283,7 @@ declare <vscale x 1 x double> @llvm.experimental.constrained.ceil.nxv1f64(<vscal
 define <vscale x 2 x double> @ceil_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
@@ -279,6 +291,7 @@ define <vscale x 2 x double> @ceil_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -293,7 +306,7 @@ declare <vscale x 2 x double> @llvm.experimental.constrained.ceil.nxv2f64(<vscal
 define <vscale x 4 x double> @ceil_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
@@ -301,6 +314,7 @@ define <vscale x 4 x double> @ceil_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -315,7 +329,7 @@ declare <vscale x 4 x double> @llvm.experimental.constrained.ceil.nxv4f64(<vscal
 define <vscale x 8 x double> @ceil_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
@@ -323,6 +337,7 @@ define <vscale x 8 x double> @ceil_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
index 31a9453204457..1df452d8641c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
@@ -7,7 +7,7 @@
 define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
@@ -15,6 +15,7 @@ define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -29,7 +30,7 @@ declare <vscale x 1 x half> @llvm.experimental.constrained.floor.nxv1f16(<vscale
 define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
@@ -37,6 +38,7 @@ define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -51,7 +53,7 @@ declare <vscale x 2 x half> @llvm.experimental.constrained.floor.nxv2f16(<vscale
 define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
@@ -59,6 +61,7 @@ define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -73,7 +76,7 @@ declare <vscale x 4 x half> @llvm.experimental.constrained.floor.nxv4f16(<vscale
 define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
@@ -81,6 +84,7 @@ define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -95,7 +99,7 @@ declare <vscale x 8 x half> @llvm.experimental.constrained.floor.nxv8f16(<vscale
 define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
@@ -103,6 +107,7 @@ define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -117,7 +122,7 @@ declare <vscale x 16 x half> @llvm.experimental.constrained.floor.nxv16f16(<vsca
 define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-LABEL: floor_nxv32f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
@@ -125,6 +130,7 @@ define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -139,7 +145,7 @@ declare <vscale x 32 x half> @llvm.experimental.constrained.floor.nxv32f16(<vsca
 define <vscale x 1 x float> @floor_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-LABEL: floor_nxv1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -147,6 +153,7 @@ define <vscale x 1 x float> @floor_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -161,7 +168,7 @@ declare <vscale x 1 x float> @llvm.experimental.constrained.floor.nxv1f32(<vscal
 define <vscale x 2 x float> @floor_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-LABEL: floor_nxv2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -169,6 +176,7 @@ define <vscale x 2 x float> @floor_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -183,7 +191,7 @@ declare <vscale x 2 x float> @llvm.experimental.constrained.floor.nxv2f32(<vscal
 define <vscale x 4 x float> @floor_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-LABEL: floor_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
@@ -191,6 +199,7 @@ define <vscale x 4 x float> @floor_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -205,7 +214,7 @@ declare <vscale x 4 x float> @llvm.experimental.constrained.floor.nxv4f32(<vscal
 define <vscale x 8 x float> @floor_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-LABEL: floor_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
@@ -213,6 +222,7 @@ define <vscale x 8 x float> @floor_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -227,7 +237,7 @@ declare <vscale x 8 x float> @llvm.experimental.constrained.floor.nxv8f32(<vscal
 define <vscale x 16 x float> @floor_nxv16f32(<vscale x 16 x float> %x) strictfp {
 ; CHECK-LABEL: floor_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
@@ -235,6 +245,7 @@ define <vscale x 16 x float> @floor_nxv16f32(<vscale x 16 x float> %x) strictfp
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -249,7 +260,7 @@ declare <vscale x 16 x float> @llvm.experimental.constrained.floor.nxv16f32(<vsc
 define <vscale x 1 x double> @floor_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-LABEL: floor_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
@@ -257,6 +268,7 @@ define <vscale x 1 x double> @floor_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -271,7 +283,7 @@ declare <vscale x 1 x double> @llvm.experimental.constrained.floor.nxv1f64(<vsca
 define <vscale x 2 x double> @floor_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-LABEL: floor_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
@@ -279,6 +291,7 @@ define <vscale x 2 x double> @floor_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -293,7 +306,7 @@ declare <vscale x 2 x double> @llvm.experimental.constrained.floor.nxv2f64(<vsca
 define <vscale x 4 x double> @floor_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-LABEL: floor_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
@@ -301,6 +314,7 @@ define <vscale x 4 x double> @floor_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -315,7 +329,7 @@ declare <vscale x 4 x double> @llvm.experimental.constrained.floor.nxv4f64(<vsca
 define <vscale x 8 x double> @floor_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-LABEL: floor_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
@@ -323,6 +337,7 @@ define <vscale x 8 x double> @floor_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
index 1e93a73ede5d6..404fb72b8abe9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
@@ -7,7 +7,7 @@
 define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
@@ -15,6 +15,7 @@ define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -29,7 +30,7 @@ declare <1 x half> @llvm.experimental.constrained.ceil.v1f16(<1 x half>, metadat
 define <2 x half> @ceil_v2f16(<2 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
@@ -37,6 +38,7 @@ define <2 x half> @ceil_v2f16(<2 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -51,7 +53,7 @@ declare <2 x half> @llvm.experimental.constrained.ceil.v2f16(<2 x half>, metadat
 define <4 x half> @ceil_v4f16(<4 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
@@ -59,6 +61,7 @@ define <4 x half> @ceil_v4f16(<4 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -73,7 +76,7 @@ declare <4 x half> @llvm.experimental.constrained.ceil.v4f16(<4 x half>, metadat
 define <8 x half> @ceil_v8f16(<8 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
@@ -81,6 +84,7 @@ define <8 x half> @ceil_v8f16(<8 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -95,7 +99,7 @@ declare <8 x half> @llvm.experimental.constrained.ceil.v8f16(<8 x half>, metadat
 define <16 x half> @ceil_v16f16(<16 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
@@ -103,6 +107,7 @@ define <16 x half> @ceil_v16f16(<16 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -118,7 +123,7 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
@@ -126,6 +131,7 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -140,7 +146,7 @@ declare <32 x half> @llvm.experimental.constrained.ceil.v32f16(<32 x half>, meta
 define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp {
 ; CHECK-LABEL: ceil_v1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -148,6 +154,7 @@ define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -162,7 +169,7 @@ declare <1 x float> @llvm.experimental.constrained.ceil.v1f32(<1 x float>, metad
 define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp {
 ; CHECK-LABEL: ceil_v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -170,6 +177,7 @@ define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -184,7 +192,7 @@ declare <2 x float> @llvm.experimental.constrained.ceil.v2f32(<2 x float>, metad
 define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp {
 ; CHECK-LABEL: ceil_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -192,6 +200,7 @@ define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -206,7 +215,7 @@ declare <4 x float> @llvm.experimental.constrained.ceil.v4f32(<4 x float>, metad
 define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp {
 ; CHECK-LABEL: ceil_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
@@ -214,6 +223,7 @@ define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -228,7 +238,7 @@ declare <8 x float> @llvm.experimental.constrained.ceil.v8f32(<8 x float>, metad
 define <16 x float> @ceil_v16f32(<16 x float> %x) strictfp {
 ; CHECK-LABEL: ceil_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
@@ -236,6 +246,7 @@ define <16 x float> @ceil_v16f32(<16 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -250,7 +261,7 @@ declare <16 x float> @llvm.experimental.constrained.ceil.v16f32(<16 x float>, me
 define <1 x double> @ceil_v1f64(<1 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_v1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
@@ -258,6 +269,7 @@ define <1 x double> @ceil_v1f64(<1 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -272,7 +284,7 @@ declare <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double>, met
 define <2 x double> @ceil_v2f64(<2 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
@@ -280,6 +292,7 @@ define <2 x double> @ceil_v2f64(<2 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -294,7 +307,7 @@ declare <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>, met
 define <4 x double> @ceil_v4f64(<4 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
@@ -302,6 +315,7 @@ define <4 x double> @ceil_v4f64(<4 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -316,7 +330,7 @@ declare <4 x double> @llvm.experimental.constrained.ceil.v4f64(<4 x double>, met
 define <8 x double> @ceil_v8f64(<8 x double> %x) strictfp {
 ; CHECK-LABEL: ceil_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
@@ -324,6 +338,7 @@ define <8 x double> @ceil_v8f64(<8 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
index 53018939fc6eb..2319aab370d2d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
@@ -7,7 +7,7 @@
 define <1 x half> @floor_v1f16(<1 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
@@ -15,6 +15,7 @@ define <1 x half> @floor_v1f16(<1 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -29,7 +30,7 @@ declare <1 x half> @llvm.experimental.constrained.floor.v1f16(<1 x half>, metada
 define <2 x half> @floor_v2f16(<2 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
@@ -37,6 +38,7 @@ define <2 x half> @floor_v2f16(<2 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -51,7 +53,7 @@ declare <2 x half> @llvm.experimental.constrained.floor.v2f16(<2 x half>, metada
 define <4 x half> @floor_v4f16(<4 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
@@ -59,6 +61,7 @@ define <4 x half> @floor_v4f16(<4 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -73,7 +76,7 @@ declare <4 x half> @llvm.experimental.constrained.floor.v4f16(<4 x half>, metada
 define <8 x half> @floor_v8f16(<8 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
@@ -81,6 +84,7 @@ define <8 x half> @floor_v8f16(<8 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -95,7 +99,7 @@ declare <8 x half> @llvm.experimental.constrained.floor.v8f16(<8 x half>, metada
 define <16 x half> @floor_v16f16(<16 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
@@ -103,6 +107,7 @@ define <16 x half> @floor_v16f16(<16 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -118,7 +123,7 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
@@ -126,6 +131,7 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -140,7 +146,7 @@ declare <32 x half> @llvm.experimental.constrained.floor.v32f16(<32 x half>, met
 define <1 x float> @floor_v1f32(<1 x float> %x) strictfp {
 ; CHECK-LABEL: floor_v1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -148,6 +154,7 @@ define <1 x float> @floor_v1f32(<1 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -162,7 +169,7 @@ declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, meta
 define <2 x float> @floor_v2f32(<2 x float> %x) strictfp {
 ; CHECK-LABEL: floor_v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -170,6 +177,7 @@ define <2 x float> @floor_v2f32(<2 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -184,7 +192,7 @@ declare <2 x float> @llvm.experimental.constrained.floor.v2f32(<2 x float>, meta
 define <4 x float> @floor_v4f32(<4 x float> %x) strictfp {
 ; CHECK-LABEL: floor_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -192,6 +200,7 @@ define <4 x float> @floor_v4f32(<4 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -206,7 +215,7 @@ declare <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float>, meta
 define <8 x float> @floor_v8f32(<8 x float> %x) strictfp {
 ; CHECK-LABEL: floor_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
@@ -214,6 +223,7 @@ define <8 x float> @floor_v8f32(<8 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -228,7 +238,7 @@ declare <8 x float> @llvm.experimental.constrained.floor.v8f32(<8 x float>, meta
 define <16 x float> @floor_v16f32(<16 x float> %x) strictfp {
 ; CHECK-LABEL: floor_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
@@ -236,6 +246,7 @@ define <16 x float> @floor_v16f32(<16 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -250,7 +261,7 @@ declare <16 x float> @llvm.experimental.constrained.floor.v16f32(<16 x float>, m
 define <1 x double> @floor_v1f64(<1 x double> %x) strictfp {
 ; CHECK-LABEL: floor_v1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
@@ -258,6 +269,7 @@ define <1 x double> @floor_v1f64(<1 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -272,7 +284,7 @@ declare <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double>, me
 define <2 x double> @floor_v2f64(<2 x double> %x) strictfp {
 ; CHECK-LABEL: floor_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
@@ -280,6 +292,7 @@ define <2 x double> @floor_v2f64(<2 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -294,7 +307,7 @@ declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, me
 define <4 x double> @floor_v4f64(<4 x double> %x) strictfp {
 ; CHECK-LABEL: floor_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
@@ -302,6 +315,7 @@ define <4 x double> @floor_v4f64(<4 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -316,7 +330,7 @@ declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, me
 define <8 x double> @floor_v8f64(<8 x double> %x) strictfp {
 ; CHECK-LABEL: floor_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
@@ -324,6 +338,7 @@ define <8 x double> @floor_v8f64(<8 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
index 9e9a8b8a4b644..719dd52494284 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
@@ -9,7 +9,7 @@ declare <2 x half> @llvm.experimental.constrained.nearbyint.v2f16(<2 x half>, me
 define <2 x half> @nearbyint_v2f16(<2 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
@@ -17,6 +17,7 @@ define <2 x half> @nearbyint_v2f16(<2 x half> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
@@ -32,7 +33,7 @@ declare <4 x half> @llvm.experimental.constrained.nearbyint.v4f16(<4 x half>, me
 define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
@@ -40,6 +41,7 @@ define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
@@ -55,7 +57,7 @@ declare <8 x half> @llvm.experimental.constrained.nearbyint.v8f16(<8 x half>, me
 define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
@@ -63,6 +65,7 @@ define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
@@ -78,7 +81,7 @@ declare <16 x half> @llvm.experimental.constrained.nearbyint.v16f16(<16 x half>,
 define <16 x half> @nearbyint_v16f16(<16 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
@@ -86,6 +89,7 @@ define <16 x half> @nearbyint_v16f16(<16 x half> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -102,7 +106,7 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
@@ -110,6 +114,7 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -125,7 +130,7 @@ declare <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float>,
 define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -133,6 +138,7 @@ define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
@@ -148,7 +154,7 @@ declare <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float>,
 define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -156,6 +162,7 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
@@ -171,7 +178,7 @@ declare <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(<8 x float>,
 define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
@@ -179,6 +186,7 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
@@ -194,7 +202,7 @@ declare <16 x float> @llvm.experimental.constrained.nearbyint.v16f32(<16 x float
 define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
@@ -202,6 +210,7 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
@@ -217,7 +226,7 @@ declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double>
 define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI9_0)(a0)
@@ -225,6 +234,7 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
@@ -240,7 +250,7 @@ declare <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double>
 define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI10_0)(a0)
@@ -248,6 +258,7 @@ define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -263,7 +274,7 @@ declare <8 x double> @llvm.experimental.constrained.nearbyint.v8f64(<8 x double>
 define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
@@ -271,6 +282,7 @@ define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
index f189354237ee3..e855d9504ff40 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
@@ -9,7 +9,7 @@
 define <1 x half> @round_v1f16(<1 x half> %x) strictfp {
 ; CHECK-LABEL: round_v1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
@@ -17,6 +17,7 @@ define <1 x half> @round_v1f16(<1 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -31,7 +32,7 @@ declare <1 x half> @llvm.experimental.constrained.round.v1f16(<1 x half>, metada
 define <2 x half> @round_v2f16(<2 x half> %x) strictfp {
 ; CHECK-LABEL: round_v2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
@@ -39,6 +40,7 @@ define <2 x half> @round_v2f16(<2 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -53,7 +55,7 @@ declare <2 x half> @llvm.experimental.constrained.round.v2f16(<2 x half>, metada
 define <4 x half> @round_v4f16(<4 x half> %x) strictfp {
 ; CHECK-LABEL: round_v4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
@@ -61,6 +63,7 @@ define <4 x half> @round_v4f16(<4 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -75,7 +78,7 @@ declare <4 x half> @llvm.experimental.constrained.round.v4f16(<4 x half>, metada
 define <8 x half> @round_v8f16(<8 x half> %x) strictfp {
 ; CHECK-LABEL: round_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
@@ -83,6 +86,7 @@ define <8 x half> @round_v8f16(<8 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -97,7 +101,7 @@ declare <8 x half> @llvm.experimental.constrained.round.v8f16(<8 x half>, metada
 define <16 x half> @round_v16f16(<16 x half> %x) strictfp {
 ; CHECK-LABEL: round_v16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
@@ -105,6 +109,7 @@ define <16 x half> @round_v16f16(<16 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -120,7 +125,7 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: round_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
@@ -128,6 +133,7 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -142,7 +148,7 @@ declare <32 x half> @llvm.experimental.constrained.round.v32f16(<32 x half>, met
 define <1 x float> @round_v1f32(<1 x float> %x) strictfp {
 ; CHECK-LABEL: round_v1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -150,6 +156,7 @@ define <1 x float> @round_v1f32(<1 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -164,7 +171,7 @@ declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, meta
 define <2 x float> @round_v2f32(<2 x float> %x) strictfp {
 ; CHECK-LABEL: round_v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -172,6 +179,7 @@ define <2 x float> @round_v2f32(<2 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -186,7 +194,7 @@ declare <2 x float> @llvm.experimental.constrained.round.v2f32(<2 x float>, meta
 define <4 x float> @round_v4f32(<4 x float> %x) strictfp {
 ; CHECK-LABEL: round_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -194,6 +202,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -208,7 +217,7 @@ declare <4 x float> @llvm.experimental.constrained.round.v4f32(<4 x float>, meta
 define <8 x float> @round_v8f32(<8 x float> %x) strictfp {
 ; CHECK-LABEL: round_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
@@ -216,6 +225,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -230,7 +240,7 @@ declare <8 x float> @llvm.experimental.constrained.round.v8f32(<8 x float>, meta
 define <16 x float> @round_v16f32(<16 x float> %x) strictfp {
 ; CHECK-LABEL: round_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
@@ -238,6 +248,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -252,7 +263,7 @@ declare <16 x float> @llvm.experimental.constrained.round.v16f32(<16 x float>, m
 define <1 x double> @round_v1f64(<1 x double> %x) strictfp {
 ; CHECK-LABEL: round_v1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
@@ -260,6 +271,7 @@ define <1 x double> @round_v1f64(<1 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -274,7 +286,7 @@ declare <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double>, me
 define <2 x double> @round_v2f64(<2 x double> %x) strictfp {
 ; CHECK-LABEL: round_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
@@ -282,6 +294,7 @@ define <2 x double> @round_v2f64(<2 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -296,7 +309,7 @@ declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, me
 define <4 x double> @round_v4f64(<4 x double> %x) strictfp {
 ; CHECK-LABEL: round_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
@@ -304,6 +317,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -318,7 +332,7 @@ declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, me
 define <8 x double> @round_v8f64(<8 x double> %x) strictfp {
 ; CHECK-LABEL: round_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
@@ -326,6 +340,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
index 11920c7c31c98..9976cd2a8ab29 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
@@ -9,7 +9,7 @@
 define <1 x half> @roundeven_v1f16(<1 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
@@ -17,6 +17,7 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -31,7 +32,7 @@ declare <1 x half> @llvm.experimental.constrained.roundeven.v1f16(<1 x half>, me
 define <2 x half> @roundeven_v2f16(<2 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
@@ -39,6 +40,7 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -53,7 +55,7 @@ declare <2 x half> @llvm.experimental.constrained.roundeven.v2f16(<2 x half>, me
 define <4 x half> @roundeven_v4f16(<4 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
@@ -61,6 +63,7 @@ define <4 x half> @roundeven_v4f16(<4 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -75,7 +78,7 @@ declare <4 x half> @llvm.experimental.constrained.roundeven.v4f16(<4 x half>, me
 define <8 x half> @roundeven_v8f16(<8 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
@@ -83,6 +86,7 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -97,7 +101,7 @@ declare <8 x half> @llvm.experimental.constrained.roundeven.v8f16(<8 x half>, me
 define <16 x half> @roundeven_v16f16(<16 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
@@ -105,6 +109,7 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -120,7 +125,7 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
@@ -128,6 +133,7 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -142,7 +148,7 @@ declare <32 x half> @llvm.experimental.constrained.roundeven.v32f16(<32 x half>,
 define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp {
 ; CHECK-LABEL: roundeven_v1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -150,6 +156,7 @@ define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -164,7 +171,7 @@ declare <1 x float> @llvm.experimental.constrained.roundeven.v1f32(<1 x float>,
 define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp {
 ; CHECK-LABEL: roundeven_v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -172,6 +179,7 @@ define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -186,7 +194,7 @@ declare <2 x float> @llvm.experimental.constrained.roundeven.v2f32(<2 x float>,
 define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp {
 ; CHECK-LABEL: roundeven_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -194,6 +202,7 @@ define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -208,7 +217,7 @@ declare <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float>,
 define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp {
 ; CHECK-LABEL: roundeven_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
@@ -216,6 +225,7 @@ define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -230,7 +240,7 @@ declare <8 x float> @llvm.experimental.constrained.roundeven.v8f32(<8 x float>,
 define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp {
 ; CHECK-LABEL: roundeven_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
@@ -238,6 +248,7 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -252,7 +263,7 @@ declare <16 x float> @llvm.experimental.constrained.roundeven.v16f32(<16 x float
 define <1 x double> @roundeven_v1f64(<1 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_v1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
@@ -260,6 +271,7 @@ define <1 x double> @roundeven_v1f64(<1 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -274,7 +286,7 @@ declare <1 x double> @llvm.experimental.constrained.roundeven.v1f64(<1 x double>
 define <2 x double> @roundeven_v2f64(<2 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
@@ -282,6 +294,7 @@ define <2 x double> @roundeven_v2f64(<2 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -296,7 +309,7 @@ declare <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double>
 define <4 x double> @roundeven_v4f64(<4 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
@@ -304,6 +317,7 @@ define <4 x double> @roundeven_v4f64(<4 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -318,7 +332,7 @@ declare <4 x double> @llvm.experimental.constrained.roundeven.v4f64(<4 x double>
 define <8 x double> @roundeven_v8f64(<8 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
@@ -326,6 +340,7 @@ define <8 x double> @roundeven_v8f64(<8 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
index f16581444afca..eac26451d5a8c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
@@ -7,13 +7,14 @@
 define <1 x half> @trunc_v1f16(<1 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
@@ -27,13 +28,14 @@ declare <1 x half> @llvm.experimental.constrained.trunc.v1f16(<1 x half>, metada
 define <2 x half> @trunc_v2f16(<2 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
@@ -47,13 +49,14 @@ declare <2 x half> @llvm.experimental.constrained.trunc.v2f16(<2 x half>, metada
 define <4 x half> @trunc_v4f16(<4 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
@@ -67,13 +70,14 @@ declare <4 x half> @llvm.experimental.constrained.trunc.v4f16(<4 x half>, metada
 define <8 x half> @trunc_v8f16(<8 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
@@ -87,13 +91,14 @@ declare <8 x half> @llvm.experimental.constrained.trunc.v8f16(<8 x half>, metada
 define <16 x half> @trunc_v16f16(<16 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -108,13 +113,14 @@ define <32 x half> @trunc_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -128,13 +134,14 @@ declare <32 x half> @llvm.experimental.constrained.trunc.v32f16(<32 x half>, met
 define <1 x float> @trunc_v1f32(<1 x float> %x) strictfp {
 ; CHECK-LABEL: trunc_v1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
@@ -148,13 +155,14 @@ declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, meta
 define <2 x float> @trunc_v2f32(<2 x float> %x) strictfp {
 ; CHECK-LABEL: trunc_v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
@@ -168,13 +176,14 @@ declare <2 x float> @llvm.experimental.constrained.trunc.v2f32(<2 x float>, meta
 define <4 x float> @trunc_v4f32(<4 x float> %x) strictfp {
 ; CHECK-LABEL: trunc_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
@@ -188,13 +197,14 @@ declare <4 x float> @llvm.experimental.constrained.trunc.v4f32(<4 x float>, meta
 define <8 x float> @trunc_v8f32(<8 x float> %x) strictfp {
 ; CHECK-LABEL: trunc_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
@@ -208,13 +218,14 @@ declare <8 x float> @llvm.experimental.constrained.trunc.v8f32(<8 x float>, meta
 define <16 x float> @trunc_v16f32(<16 x float> %x) strictfp {
 ; CHECK-LABEL: trunc_v16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
@@ -228,13 +239,14 @@ declare <16 x float> @llvm.experimental.constrained.trunc.v16f32(<16 x float>, m
 define <1 x double> @trunc_v1f64(<1 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_v1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
@@ -248,13 +260,14 @@ declare <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double>, me
 define <2 x double> @trunc_v2f64(<2 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
@@ -268,13 +281,14 @@ declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, me
 define <4 x double> @trunc_v4f64(<4 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -288,13 +302,14 @@ declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, me
 define <8 x double> @trunc_v8f64(<8 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_v8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index 79c36a629465d..f4d7074c7f6b2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -3459,6 +3459,8 @@ define void @mulhu_v4i64(ptr %x) {
 ; RV64-NEXT:    lui a1, %hi(.LCPI184_0)
 ; RV64-NEXT:    addi a1, a1, %lo(.LCPI184_0)
 ; RV64-NEXT:    vle64.v v10, (a1)
+; RV64-NEXT:    vmulhu.vv v10, v8, v10
+; RV64-NEXT:    vsub.vv v8, v8, v10
 ; RV64-NEXT:    li a1, -1
 ; RV64-NEXT:    slli a1, a1, 63
 ; RV64-NEXT:    vmv.s.x v12, a1
@@ -3466,8 +3468,6 @@ define void @mulhu_v4i64(ptr %x) {
 ; RV64-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; RV64-NEXT:    vslideup.vi v14, v12, 2
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmulhu.vv v10, v8, v10
-; RV64-NEXT:    vsub.vv v8, v8, v10
 ; RV64-NEXT:    vmulhu.vv v8, v8, v14
 ; RV64-NEXT:    vadd.vv v8, v8, v10
 ; RV64-NEXT:    lui a1, 12320
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 178a920169ad9..bc3e135a588a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -159,17 +159,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 82
+; RV32-NEXT:    li a3, 80
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 82 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 80 * vlenb
 ; RV32-NEXT:    addi a3, a1, 256
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v16, (a3)
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 57
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 6
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -177,26 +176,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vslideup.vi v8, v16, 4
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 41
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, 12
-; RV32-NEXT:    vmv.s.x v1, a4
+; RV32-NEXT:    vmv.s.x v0, a4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v16, v16, 16
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a5, a4, 6
-; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    li a5, 56
+; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vmv1r.v v3, v0
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vslideup.vi v8, v16, 10, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 45
+; RV32-NEXT:    li a5, 44
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
@@ -206,8 +205,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    vle16.v v8, (a4)
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a5, a4, 5
-; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    slli a4, a4, 5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
@@ -216,21 +214,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a5, 1
 ; RV32-NEXT:    vle16.v v8, (a4)
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a6, 25
+; RV32-NEXT:    li a6, 24
 ; RV32-NEXT:    mul a4, a4, a6
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 73
+; RV32-NEXT:    li a4, 72
 ; RV32-NEXT:    mul a1, a1, a4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vle32.v v24, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -238,27 +236,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a5, -64
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v16, v8, v4
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 45
+; RV32-NEXT:    li a3, 44
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -266,259 +263,257 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 45
+; RV32-NEXT:    li a3, 44
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vslideup.vi v12, v8, 2
+; RV32-NEXT:    vmv1r.v v8, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 21
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v1, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vs1r.v v3, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslideup.vi v12, v16, 8, v0.t
-; RV32-NEXT:    vmv.v.v v20, v12
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_2)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_2)
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_3)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_3)
-; RV32-NEXT:    lui a4, %hi(.LCPI6_4)
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v4, (a1)
-; RV32-NEXT:    vle16.v v16, (a3)
-; RV32-NEXT:    addi a1, a4, %lo(.LCPI6_4)
+; RV32-NEXT:    vle16.v v0, (a1)
+; RV32-NEXT:    vle16.v v4, (a3)
+; RV32-NEXT:    lui a1, %hi(.LCPI6_4)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_4)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v2, (a1)
+; RV32-NEXT:    vle16.v v10, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v24, v8, v4
+; RV32-NEXT:    vrgatherei16.vv v24, v16, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v24, v16, v4, v0.t
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v20, v24
+; RV32-NEXT:    vmv.v.v v12, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v16, v24, v2
-; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vrgatherei16.vv v12, v24, v10
+; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v16, v8, 6, v0.t
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vslideup.vi v12, v24, 6, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_5)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_5)
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_6)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_6)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v16, (a1)
-; RV32-NEXT:    vle16.v v4, (a3)
-; RV32-NEXT:    li a1, 960
-; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vle16.v v12, (a1)
+; RV32-NEXT:    vle16.v v8, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 13
+; RV32-NEXT:    li a3, 12
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 960
+; RV32-NEXT:    vmv.s.x v8, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v16
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v24, v0, v12
+; RV32-NEXT:    vmv1r.v v3, v8
+; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 12
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
+; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v24, v16, v8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_7)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_7)
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_8)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_8)
-; RV32-NEXT:    lui a4, %hi(.LCPI6_9)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    addi a1, a4, %lo(.LCPI6_9)
+; RV32-NEXT:    lui a1, %hi(.LCPI6_9)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_9)
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v24, (a3)
-; RV32-NEXT:    vle16.v v28, (a1)
+; RV32-NEXT:    vle16.v v4, (a3)
+; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v4, v0, v8
+; RV32-NEXT:    vrgatherei16.vv v12, v24, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 21
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v4, v8, 4, v0.t
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv4r.v v24, v16
+; RV32-NEXT:    vslideup.vi v12, v16, 4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 21
+; RV32-NEXT:    li a3, 12
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v8, v0, v24
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v4
+; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 13
+; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v16, v28, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 13
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_10)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_10)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vle16.v v8, (a1)
+; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    lui a1, 15
 ; RV32-NEXT:    vmv.s.x v3, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v12, v16, 6
+; RV32-NEXT:    vslideup.vi v8, v16, 6
 ; RV32-NEXT:    vmv1r.v v0, v3
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v12, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_11)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_11)
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_12)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_12)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v8, (a1)
-; RV32-NEXT:    vle16.v v12, (a3)
+; RV32-NEXT:    vle16.v v24, (a1)
+; RV32-NEXT:    vle16.v v4, (a3)
 ; RV32-NEXT:    li a1, 1008
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v16, v8
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 48
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 2
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI6_13)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_13)
 ; RV32-NEXT:    lui a3, %hi(.LCPI6_14)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI6_14)
-; RV32-NEXT:    lui a4, %hi(.LCPI6_15)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a1)
-; RV32-NEXT:    addi a1, a4, %lo(.LCPI6_15)
+; RV32-NEXT:    lui a1, %hi(.LCPI6_15)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI6_15)
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
 ; RV32-NEXT:    vle16.v v24, (a3)
 ; RV32-NEXT:    vle16.v v8, (a1)
@@ -526,27 +521,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 41
+; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 6
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v16, v8, v20, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -554,7 +548,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v20, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    li a3, 72
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -562,12 +556,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v8, v0, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 49
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -576,31 +570,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 21
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 13
+; RV32-NEXT:    li a2, 12
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 57
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 2
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmv.v.v v28, v0
 ; RV32-NEXT:    vmv.v.v v16, v8
 ; RV32-NEXT:    addi a1, a0, 320
@@ -614,21 +605,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vse32.v v20, (a1)
 ; RV32-NEXT:    addi a1, a0, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    li a3, 36
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 45
+; RV32-NEXT:    li a2, 44
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 82
+; RV32-NEXT:    li a1, 80
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
index 1748315186936..7608349ef7aef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
@@ -549,20 +549,20 @@ define <128 x i1> @buildvec_mask_v128i1() {
 define <128 x i1> @buildvec_mask_optsize_v128i1() optsize {
 ; CHECK-LABEL: buildvec_mask_optsize_v128i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI21_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI21_0)
-; CHECK-NEXT:    li a1, 128
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a0)
+; CHECK-NEXT:    li a0, 128
+; CHECK-NEXT:    lui a1, %hi(.LCPI21_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI21_0)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vlm.v v0, (a1)
 ; CHECK-NEXT:    ret
 ;
 ; ZVE32F-LABEL: buildvec_mask_optsize_v128i1:
 ; ZVE32F:       # %bb.0:
-; ZVE32F-NEXT:    lui a0, %hi(.LCPI21_0)
-; ZVE32F-NEXT:    addi a0, a0, %lo(.LCPI21_0)
-; ZVE32F-NEXT:    li a1, 128
-; ZVE32F-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; ZVE32F-NEXT:    vlm.v v0, (a0)
+; ZVE32F-NEXT:    li a0, 128
+; ZVE32F-NEXT:    lui a1, %hi(.LCPI21_0)
+; ZVE32F-NEXT:    addi a1, a1, %lo(.LCPI21_0)
+; ZVE32F-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; ZVE32F-NEXT:    vlm.v v0, (a1)
 ; ZVE32F-NEXT:    ret
   ret <128 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index db0969c85a8e2..69341981288b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -13327,22 +13327,22 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) {
 define <8 x i16> @mgather_shuffle_vrgather(ptr %base) {
 ; RV32-LABEL: mgather_shuffle_vrgather:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    lui a1, %hi(.LCPI119_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI119_0)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT:    vle16.v v9, (a0)
-; RV32-NEXT:    lui a0, %hi(.LCPI119_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI119_0)
+; RV32-NEXT:    vle16.v v9, (a1)
 ; RV32-NEXT:    vle16.v v10, (a0)
-; RV32-NEXT:    vrgather.vv v8, v9, v10
+; RV32-NEXT:    vrgather.vv v8, v10, v9
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: mgather_shuffle_vrgather:
 ; RV64V:       # %bb.0:
+; RV64V-NEXT:    lui a1, %hi(.LCPI119_0)
+; RV64V-NEXT:    addi a1, a1, %lo(.LCPI119_0)
 ; RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV64V-NEXT:    vle16.v v9, (a0)
-; RV64V-NEXT:    lui a0, %hi(.LCPI119_0)
-; RV64V-NEXT:    addi a0, a0, %lo(.LCPI119_0)
+; RV64V-NEXT:    vle16.v v9, (a1)
 ; RV64V-NEXT:    vle16.v v10, (a0)
-; RV64V-NEXT:    vrgather.vv v8, v9, v10
+; RV64V-NEXT:    vrgather.vv v8, v10, v9
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_shuffle_vrgather:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
index d70ed2fb0e266..4b1f0beb48700 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -228,11 +228,11 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) {
 define <32 x i8> @reverse_v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: reverse_v32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI12_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI12_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI12_0)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
@@ -243,11 +243,11 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) {
 define <64 x i8> @reverse_v64i8(<64 x i8> %a) {
 ; CHECK-LABEL: reverse_v64i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI13_0)
-; CHECK-NEXT:    li a1, 64
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
+; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    lui a1, %hi(.LCPI13_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI13_0)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vle8.v v16, (a1)
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -323,11 +323,11 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) {
 define <32 x i16> @reverse_v32i16(<32 x i16> %a) {
 ; CHECK-LABEL: reverse_v32i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI19_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI19_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI19_0)
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    vsext.vf2 v16, v12
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
@@ -520,11 +520,11 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) {
 define <32 x half> @reverse_v32f16(<32 x half> %a) {
 ; CHECK-LABEL: reverse_v32f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI34_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI34_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI34_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI34_0)
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    vsext.vf2 v16, v12
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
@@ -820,33 +820,33 @@ define <6 x i64> @reverse_v6i64(<6 x i64> %a) {
 define <12 x i64> @reverse_v12i64(<12 x i64> %a) {
 ; RV32-BITS-UNKNOWN-LABEL: reverse_v12i64:
 ; RV32-BITS-UNKNOWN:       # %bb.0:
-; RV32-BITS-UNKNOWN-NEXT:    lui a0, %hi(.LCPI46_0)
-; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; RV32-BITS-UNKNOWN-NEXT:    li a1, 32
-; RV32-BITS-UNKNOWN-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vle16.v v24, (a0)
+; RV32-BITS-UNKNOWN-NEXT:    li a0, 32
+; RV32-BITS-UNKNOWN-NEXT:    lui a1, %hi(.LCPI46_0)
+; RV32-BITS-UNKNOWN-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; RV32-BITS-UNKNOWN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-BITS-UNKNOWN-NEXT:    vle16.v v24, (a1)
 ; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v16, v8, v24
 ; RV32-BITS-UNKNOWN-NEXT:    vmv.v.v v8, v16
 ; RV32-BITS-UNKNOWN-NEXT:    ret
 ;
 ; RV32-BITS-256-LABEL: reverse_v12i64:
 ; RV32-BITS-256:       # %bb.0:
-; RV32-BITS-256-NEXT:    lui a0, %hi(.LCPI46_0)
-; RV32-BITS-256-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; RV32-BITS-256-NEXT:    li a1, 32
-; RV32-BITS-256-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-BITS-256-NEXT:    vle16.v v24, (a0)
+; RV32-BITS-256-NEXT:    li a0, 32
+; RV32-BITS-256-NEXT:    lui a1, %hi(.LCPI46_0)
+; RV32-BITS-256-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; RV32-BITS-256-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-BITS-256-NEXT:    vle16.v v24, (a1)
 ; RV32-BITS-256-NEXT:    vrgatherei16.vv v16, v8, v24
 ; RV32-BITS-256-NEXT:    vmv.v.v v8, v16
 ; RV32-BITS-256-NEXT:    ret
 ;
 ; RV32-BITS-512-LABEL: reverse_v12i64:
 ; RV32-BITS-512:       # %bb.0:
-; RV32-BITS-512-NEXT:    lui a0, %hi(.LCPI46_0)
-; RV32-BITS-512-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; RV32-BITS-512-NEXT:    li a1, 32
-; RV32-BITS-512-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-BITS-512-NEXT:    vle16.v v24, (a0)
+; RV32-BITS-512-NEXT:    li a0, 32
+; RV32-BITS-512-NEXT:    lui a1, %hi(.LCPI46_0)
+; RV32-BITS-512-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; RV32-BITS-512-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-BITS-512-NEXT:    vle16.v v24, (a1)
 ; RV32-BITS-512-NEXT:    vrgatherei16.vv v16, v8, v24
 ; RV32-BITS-512-NEXT:    vmv.v.v v8, v16
 ; RV32-BITS-512-NEXT:    ret
@@ -883,11 +883,11 @@ define <12 x i64> @reverse_v12i64(<12 x i64> %a) {
 ;
 ; RV32-ZVBB-LABEL: reverse_v12i64:
 ; RV32-ZVBB:       # %bb.0:
-; RV32-ZVBB-NEXT:    lui a0, %hi(.LCPI46_0)
-; RV32-ZVBB-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; RV32-ZVBB-NEXT:    li a1, 32
-; RV32-ZVBB-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-ZVBB-NEXT:    vle16.v v24, (a0)
+; RV32-ZVBB-NEXT:    li a0, 32
+; RV32-ZVBB-NEXT:    lui a1, %hi(.LCPI46_0)
+; RV32-ZVBB-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; RV32-ZVBB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-ZVBB-NEXT:    vle16.v v24, (a1)
 ; RV32-ZVBB-NEXT:    vrgatherei16.vv v16, v8, v24
 ; RV32-ZVBB-NEXT:    vmv.v.v v8, v16
 ; RV32-ZVBB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll
index 0161ac4bc338d..e2580c132f65e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll
@@ -225,11 +225,11 @@ declare <16 x i64> @llvm.experimental.stepvector.v16i64()
 define <16 x i64> @stepvector_v16i64() {
 ; RV32-LABEL: stepvector_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI16_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI16_0)
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT:    vle8.v v16, (a0)
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    lui a1, %hi(.LCPI16_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI16_0)
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vle8.v v16, (a1)
 ; RV32-NEXT:    vsext.vf4 v8, v16
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll
index 29f8eaba90052..e3c7d02462cc7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll
@@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.sdiv.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
 define <8 x i7> @vdiv_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vdiv_vv_v8i7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vdiv.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.sdiv.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll
index 3f8eb0ff276b7..03bd85bf5e69e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll
@@ -10,10 +10,9 @@ define <8 x i7> @vdivu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK-LABEL: vdivu_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vdivu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.udiv.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
index 9789afda9344a..0b0d758ad8ded 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll
@@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.smax.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
 define <8 x i7> @vmax_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmax_vv_v8i7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vmax.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.smax.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
index 36b0a4642b616..98e630a0e59e5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll
@@ -10,10 +10,9 @@ define <8 x i7> @vmaxu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK-LABEL: vmaxu_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vmaxu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.umax.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
index adb0a30f34d35..a6e3764b37550 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll
@@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.smin.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
 define <8 x i7> @vmin_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vmin_vv_v8i7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vmin.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.smin.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
index 671ce82d4ae79..c59b65edd1ec1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll
@@ -10,10 +10,9 @@ define <8 x i7> @vminu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK-LABEL: vminu_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vminu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.umin.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll
index 4bbbad5ed0e0e..ff8a63e371c8e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll
@@ -9,12 +9,11 @@ declare <8 x i7> @llvm.vp.srem.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
 define <8 x i7> @vrem_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vrem_vv_v8i7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vrem.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.srem.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll
index ee11307bddc88..b5eec4142c782 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll
@@ -10,10 +10,9 @@ define <8 x i7> @vremu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK-LABEL: vremu_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vremu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.urem.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll
index c4b7c1f2f19f0..16a0fddfa9827 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll
@@ -10,9 +10,8 @@ define <8 x i7> @vsll_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex
 ; CHECK-LABEL: vsll_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
 ; CHECK-NEXT:    vsll.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.shl.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll
index 7ea5b1f0b505a..180fafa9659b1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll
@@ -10,11 +10,10 @@ define <8 x i7> @vsra_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex
 ; CHECK-LABEL: vsra_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.ashr.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll
index 9f9d4af0cc2f3..22f04803eadd7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll
@@ -10,10 +10,9 @@ define <8 x i7> @vsrl_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroex
 ; CHECK-LABEL: vsrl_vv_v8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 127
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %v = call <8 x i7> @llvm.vp.lshr.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
index f88a9b3081a1a..372937bb5ca5d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
@@ -9,7 +9,7 @@ declare <vscale x 1 x half> @llvm.experimental.constrained.nearbyint.nxv1f16(<vs
 define <vscale x 1 x half> @nearbyint_nxv1f16(<vscale x 1 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
@@ -17,6 +17,7 @@ define <vscale x 1 x half> @nearbyint_nxv1f16(<vscale x 1 x half> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
@@ -32,7 +33,7 @@ declare <vscale x 2 x half> @llvm.experimental.constrained.nearbyint.nxv2f16(<vs
 define <vscale x 2 x half> @nearbyint_nxv2f16(<vscale x 2 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
@@ -40,6 +41,7 @@ define <vscale x 2 x half> @nearbyint_nxv2f16(<vscale x 2 x half> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
@@ -55,7 +57,7 @@ declare <vscale x 4 x half> @llvm.experimental.constrained.nearbyint.nxv4f16(<vs
 define <vscale x 4 x half> @nearbyint_nxv4f16(<vscale x 4 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
@@ -63,6 +65,7 @@ define <vscale x 4 x half> @nearbyint_nxv4f16(<vscale x 4 x half> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
@@ -78,7 +81,7 @@ declare <vscale x 8 x half> @llvm.experimental.constrained.nearbyint.nxv8f16(<vs
 define <vscale x 8 x half> @nearbyint_nxv8f16(<vscale x 8 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
@@ -86,6 +89,7 @@ define <vscale x 8 x half> @nearbyint_nxv8f16(<vscale x 8 x half> %v) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -101,7 +105,7 @@ declare <vscale x 16 x half> @llvm.experimental.constrained.nearbyint.nxv16f16(<
 define <vscale x 16 x half> @nearbyint_nxv16f16(<vscale x 16 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
@@ -109,6 +113,7 @@ define <vscale x 16 x half> @nearbyint_nxv16f16(<vscale x 16 x half> %v) strictf
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -124,7 +129,7 @@ declare <vscale x 32 x half> @llvm.experimental.constrained.nearbyint.nxv32f16(<
 define <vscale x 32 x half> @nearbyint_nxv32f16(<vscale x 32 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv32f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
@@ -132,6 +137,7 @@ define <vscale x 32 x half> @nearbyint_nxv32f16(<vscale x 32 x half> %v) strictf
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
@@ -147,7 +153,7 @@ declare <vscale x 1 x float> @llvm.experimental.constrained.nearbyint.nxv1f32(<v
 define <vscale x 1 x float> @nearbyint_nxv1f32(<vscale x 1 x float> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -155,6 +161,7 @@ define <vscale x 1 x float> @nearbyint_nxv1f32(<vscale x 1 x float> %v) strictfp
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
@@ -170,7 +177,7 @@ declare <vscale x 2 x float> @llvm.experimental.constrained.nearbyint.nxv2f32(<v
 define <vscale x 2 x float> @nearbyint_nxv2f32(<vscale x 2 x float> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -178,6 +185,7 @@ define <vscale x 2 x float> @nearbyint_nxv2f32(<vscale x 2 x float> %v) strictfp
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
@@ -193,7 +201,7 @@ declare <vscale x 4 x float> @llvm.experimental.constrained.nearbyint.nxv4f32(<v
 define <vscale x 4 x float> @nearbyint_nxv4f32(<vscale x 4 x float> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
@@ -201,6 +209,7 @@ define <vscale x 4 x float> @nearbyint_nxv4f32(<vscale x 4 x float> %v) strictfp
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
@@ -216,7 +225,7 @@ declare <vscale x 8 x float> @llvm.experimental.constrained.nearbyint.nxv8f32(<v
 define <vscale x 8 x float> @nearbyint_nxv8f32(<vscale x 8 x float> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
@@ -224,6 +233,7 @@ define <vscale x 8 x float> @nearbyint_nxv8f32(<vscale x 8 x float> %v) strictfp
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
@@ -239,7 +249,7 @@ declare <vscale x 16 x float> @llvm.experimental.constrained.nearbyint.nxv16f32(
 define <vscale x 16 x float> @nearbyint_nxv16f32(<vscale x 16 x float> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
@@ -247,6 +257,7 @@ define <vscale x 16 x float> @nearbyint_nxv16f32(<vscale x 16 x float> %v) stric
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
@@ -262,7 +273,7 @@ declare <vscale x 1 x double> @llvm.experimental.constrained.nearbyint.nxv1f64(<
 define <vscale x 1 x double> @nearbyint_nxv1f64(<vscale x 1 x double> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
@@ -270,6 +281,7 @@ define <vscale x 1 x double> @nearbyint_nxv1f64(<vscale x 1 x double> %v) strict
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
@@ -285,7 +297,7 @@ declare <vscale x 2 x double> @llvm.experimental.constrained.nearbyint.nxv2f64(<
 define <vscale x 2 x double> @nearbyint_nxv2f64(<vscale x 2 x double> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
@@ -293,6 +305,7 @@ define <vscale x 2 x double> @nearbyint_nxv2f64(<vscale x 2 x double> %v) strict
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -308,7 +321,7 @@ declare <vscale x 4 x double> @llvm.experimental.constrained.nearbyint.nxv4f64(<
 define <vscale x 4 x double> @nearbyint_nxv4f64(<vscale x 4 x double> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
@@ -316,6 +329,7 @@ define <vscale x 4 x double> @nearbyint_nxv4f64(<vscale x 4 x double> %v) strict
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -331,7 +345,7 @@ declare <vscale x 8 x double> @llvm.experimental.constrained.nearbyint.nxv8f64(<
 define <vscale x 8 x double> @nearbyint_nxv8f64(<vscale x 8 x double> %v) strictfp {
 ; CHECK-LABEL: nearbyint_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
@@ -339,6 +353,7 @@ define <vscale x 8 x double> @nearbyint_nxv8f64(<vscale x 8 x double> %v) strict
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
index 9da4d7ec9f2d0..4aa26d6b79ca4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
@@ -11,22 +11,22 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) {
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    addi a3, a2, 1
-; RV32-NEXT:    addi a4, a0, 1
+; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
+; RV32-NEXT:    vslideup.vx v8, v9, a2
+; RV32-NEXT:    addi a2, a0, 1
 ; RV32-NEXT:  .LBB0_1: # %for.body
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    th.lrb a0, a1, a0, 0
-; RV32-NEXT:    vmv.s.x v9, zero
-; RV32-NEXT:    vmv1r.v v10, v8
-; RV32-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
-; RV32-NEXT:    vslideup.vx v10, v9, a2
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, tu, ma
-; RV32-NEXT:    vmv.s.x v10, a0
-; RV32-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32-NEXT:    vmseq.vi v9, v10, 0
+; RV32-NEXT:    vmv1r.v v9, v8
+; RV32-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; RV32-NEXT:    vmv.s.x v9, a0
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    vmseq.vi v9, v9, 0
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    andi a5, a0, 255
-; RV32-NEXT:    mv a0, a4
-; RV32-NEXT:    bnez a5, .LBB0_1
+; RV32-NEXT:    andi a3, a0, 255
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    bnez a3, .LBB0_1
 ; RV32-NEXT:  # %bb.2: # %if.then381
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
@@ -37,23 +37,23 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) {
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a3
 ; RV64-NEXT:    addi a3, a2, 1
-; RV64-NEXT:    addi a4, a0, 1
+; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
+; RV64-NEXT:    vslideup.vx v8, v9, a2
+; RV64-NEXT:    addi a2, a0, 1
 ; RV64-NEXT:  .LBB0_1: # %for.body
 ; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64-NEXT:    sext.w a0, a0
 ; RV64-NEXT:    th.lrb a0, a1, a0, 0
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vmv1r.v v10, v8
-; RV64-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
-; RV64-NEXT:    vslideup.vx v10, v9, a2
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, tu, ma
-; RV64-NEXT:    vmv.s.x v10, a0
-; RV64-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vmseq.vi v9, v10, 0
+; RV64-NEXT:    vmv1r.v v9, v8
+; RV64-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT:    vmseq.vi v9, v9, 0
 ; RV64-NEXT:    vmv.x.s a0, v9
-; RV64-NEXT:    andi a5, a0, 255
-; RV64-NEXT:    mv a0, a4
-; RV64-NEXT:    bnez a5, .LBB0_1
+; RV64-NEXT:    andi a3, a0, 255
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    bnez a3, .LBB0_1
 ; RV64-NEXT:  # %bb.2: # %if.then381
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
index 3276f481f30ea..aaa7a538e70fb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
@@ -9,7 +9,7 @@
 define <vscale x 1 x half> @round_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
@@ -17,6 +17,7 @@ define <vscale x 1 x half> @round_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -31,7 +32,7 @@ declare <vscale x 1 x half> @llvm.experimental.constrained.round.nxv1f16(<vscale
 define <vscale x 2 x half> @round_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
@@ -39,6 +40,7 @@ define <vscale x 2 x half> @round_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -53,7 +55,7 @@ declare <vscale x 2 x half> @llvm.experimental.constrained.round.nxv2f16(<vscale
 define <vscale x 4 x half> @round_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
@@ -61,6 +63,7 @@ define <vscale x 4 x half> @round_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -75,7 +78,7 @@ declare <vscale x 4 x half> @llvm.experimental.constrained.round.nxv4f16(<vscale
 define <vscale x 8 x half> @round_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
@@ -83,6 +86,7 @@ define <vscale x 8 x half> @round_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -97,7 +101,7 @@ declare <vscale x 8 x half> @llvm.experimental.constrained.round.nxv8f16(<vscale
 define <vscale x 16 x half> @round_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
@@ -105,6 +109,7 @@ define <vscale x 16 x half> @round_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -119,7 +124,7 @@ declare <vscale x 16 x half> @llvm.experimental.constrained.round.nxv16f16(<vsca
 define <vscale x 32 x half> @round_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-LABEL: round_nxv32f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
@@ -127,6 +132,7 @@ define <vscale x 32 x half> @round_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -141,7 +147,7 @@ declare <vscale x 32 x half> @llvm.experimental.constrained.round.nxv32f16(<vsca
 define <vscale x 1 x float> @round_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-LABEL: round_nxv1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -149,6 +155,7 @@ define <vscale x 1 x float> @round_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -163,7 +170,7 @@ declare <vscale x 1 x float> @llvm.experimental.constrained.round.nxv1f32(<vscal
 define <vscale x 2 x float> @round_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-LABEL: round_nxv2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -171,6 +178,7 @@ define <vscale x 2 x float> @round_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -185,7 +193,7 @@ declare <vscale x 2 x float> @llvm.experimental.constrained.round.nxv2f32(<vscal
 define <vscale x 4 x float> @round_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-LABEL: round_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
@@ -193,6 +201,7 @@ define <vscale x 4 x float> @round_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -207,7 +216,7 @@ declare <vscale x 4 x float> @llvm.experimental.constrained.round.nxv4f32(<vscal
 define <vscale x 8 x float> @round_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-LABEL: round_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
@@ -215,6 +224,7 @@ define <vscale x 8 x float> @round_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -229,7 +239,7 @@ declare <vscale x 8 x float> @llvm.experimental.constrained.round.nxv8f32(<vscal
 define <vscale x 16 x float> @round_nxv16f32(<vscale x 16 x float> %x) strictfp {
 ; CHECK-LABEL: round_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
@@ -237,6 +247,7 @@ define <vscale x 16 x float> @round_nxv16f32(<vscale x 16 x float> %x) strictfp
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -251,7 +262,7 @@ declare <vscale x 16 x float> @llvm.experimental.constrained.round.nxv16f32(<vsc
 define <vscale x 1 x double> @round_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-LABEL: round_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
@@ -259,6 +270,7 @@ define <vscale x 1 x double> @round_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -273,7 +285,7 @@ declare <vscale x 1 x double> @llvm.experimental.constrained.round.nxv1f64(<vsca
 define <vscale x 2 x double> @round_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-LABEL: round_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
@@ -281,6 +293,7 @@ define <vscale x 2 x double> @round_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -295,7 +308,7 @@ declare <vscale x 2 x double> @llvm.experimental.constrained.round.nxv2f64(<vsca
 define <vscale x 4 x double> @round_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-LABEL: round_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
@@ -303,6 +316,7 @@ define <vscale x 4 x double> @round_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -317,7 +331,7 @@ declare <vscale x 4 x double> @llvm.experimental.constrained.round.nxv4f64(<vsca
 define <vscale x 8 x double> @round_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-LABEL: round_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
@@ -325,6 +339,7 @@ define <vscale x 8 x double> @round_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
index 4ebfcccbaaa6e..cdc01d658778b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
@@ -9,7 +9,7 @@
 define <vscale x 1 x half> @roundeven_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
@@ -17,6 +17,7 @@ define <vscale x 1 x half> @roundeven_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -31,7 +32,7 @@ declare <vscale x 1 x half> @llvm.experimental.constrained.roundeven.nxv1f16(<vs
 define <vscale x 2 x half> @roundeven_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
@@ -39,6 +40,7 @@ define <vscale x 2 x half> @roundeven_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -53,7 +55,7 @@ declare <vscale x 2 x half> @llvm.experimental.constrained.roundeven.nxv2f16(<vs
 define <vscale x 4 x half> @roundeven_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
@@ -61,6 +63,7 @@ define <vscale x 4 x half> @roundeven_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -75,7 +78,7 @@ declare <vscale x 4 x half> @llvm.experimental.constrained.roundeven.nxv4f16(<vs
 define <vscale x 8 x half> @roundeven_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
@@ -83,6 +86,7 @@ define <vscale x 8 x half> @roundeven_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -97,7 +101,7 @@ declare <vscale x 8 x half> @llvm.experimental.constrained.roundeven.nxv8f16(<vs
 define <vscale x 16 x half> @roundeven_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
@@ -105,6 +109,7 @@ define <vscale x 16 x half> @roundeven_nxv16f16(<vscale x 16 x half> %x) strictf
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -119,7 +124,7 @@ declare <vscale x 16 x half> @llvm.experimental.constrained.roundeven.nxv16f16(<
 define <vscale x 32 x half> @roundeven_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv32f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
@@ -127,6 +132,7 @@ define <vscale x 32 x half> @roundeven_nxv32f16(<vscale x 32 x half> %x) strictf
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -141,7 +147,7 @@ declare <vscale x 32 x half> @llvm.experimental.constrained.roundeven.nxv32f16(<
 define <vscale x 1 x float> @roundeven_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -149,6 +155,7 @@ define <vscale x 1 x float> @roundeven_nxv1f32(<vscale x 1 x float> %x) strictfp
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -163,7 +170,7 @@ declare <vscale x 1 x float> @llvm.experimental.constrained.roundeven.nxv1f32(<v
 define <vscale x 2 x float> @roundeven_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
@@ -171,6 +178,7 @@ define <vscale x 2 x float> @roundeven_nxv2f32(<vscale x 2 x float> %x) strictfp
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -185,7 +193,7 @@ declare <vscale x 2 x float> @llvm.experimental.constrained.roundeven.nxv2f32(<v
 define <vscale x 4 x float> @roundeven_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
@@ -193,6 +201,7 @@ define <vscale x 4 x float> @roundeven_nxv4f32(<vscale x 4 x float> %x) strictfp
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -207,7 +216,7 @@ declare <vscale x 4 x float> @llvm.experimental.constrained.roundeven.nxv4f32(<v
 define <vscale x 8 x float> @roundeven_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
@@ -215,6 +224,7 @@ define <vscale x 8 x float> @roundeven_nxv8f32(<vscale x 8 x float> %x) strictfp
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -229,7 +239,7 @@ declare <vscale x 8 x float> @llvm.experimental.constrained.roundeven.nxv8f32(<v
 define <vscale x 16 x float> @roundeven_nxv16f32(<vscale x 16 x float> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
@@ -237,6 +247,7 @@ define <vscale x 16 x float> @roundeven_nxv16f32(<vscale x 16 x float> %x) stric
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -251,7 +262,7 @@ declare <vscale x 16 x float> @llvm.experimental.constrained.roundeven.nxv16f32(
 define <vscale x 1 x double> @roundeven_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
@@ -259,6 +270,7 @@ define <vscale x 1 x double> @roundeven_nxv1f64(<vscale x 1 x double> %x) strict
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -273,7 +285,7 @@ declare <vscale x 1 x double> @llvm.experimental.constrained.roundeven.nxv1f64(<
 define <vscale x 2 x double> @roundeven_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
@@ -281,6 +293,7 @@ define <vscale x 2 x double> @roundeven_nxv2f64(<vscale x 2 x double> %x) strict
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
@@ -295,7 +308,7 @@ declare <vscale x 2 x double> @llvm.experimental.constrained.roundeven.nxv2f64(<
 define <vscale x 4 x double> @roundeven_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
@@ -303,6 +316,7 @@ define <vscale x 4 x double> @roundeven_nxv4f64(<vscale x 4 x double> %x) strict
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
@@ -317,7 +331,7 @@ declare <vscale x 4 x double> @llvm.experimental.constrained.roundeven.nxv4f64(<
 define <vscale x 8 x double> @roundeven_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-LABEL: roundeven_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
@@ -325,6 +339,7 @@ define <vscale x 8 x double> @roundeven_nxv8f64(<vscale x 8 x double> %x) strict
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index bc5617957d7d0..2c5a3dfffc2cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -1282,18 +1282,17 @@ define <vscale x 1 x i9> @fshr_v1i9(<vscale x 1 x i9> %a, <vscale x 1 x i9> %b,
 ; CHECK-LABEL: fshr_v1i9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsll.vi v9, v9, 7, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a1, v0.t
 ; CHECK-NEXT:    li a0, 9
 ; CHECK-NEXT:    vremu.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vadd.vi v10, v10, 7, v0.t
 ; CHECK-NEXT:    vand.vi v11, v10, 15, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 7, v0.t
 ; CHECK-NEXT:    vsrl.vv v9, v9, v11, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vnot.v v10, v10, v0.t
 ; CHECK-NEXT:    vand.vi v10, v10, 15, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsll.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
@@ -1306,18 +1305,17 @@ define <vscale x 1 x i9> @fshl_v1i9(<vscale x 1 x i9> %a, <vscale x 1 x i9> %b,
 ; CHECK-LABEL: fshl_v1i9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 511
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsll.vi v9, v9, 7, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vand.vx v10, v10, a1, v0.t
 ; CHECK-NEXT:    li a0, 9
 ; CHECK-NEXT:    vremu.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vnot.v v11, v10, v0.t
-; CHECK-NEXT:    vand.vi v11, v11, 15, v0.t
-; CHECK-NEXT:    vsrl.vv v9, v9, v11, v0.t
+; CHECK-NEXT:    vand.vi v11, v10, 15, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v11, v0.t
+; CHECK-NEXT:    vnot.v v10, v10, v0.t
 ; CHECK-NEXT:    vand.vi v10, v10, 15, v0.t
-; CHECK-NEXT:    vsll.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 7, v0.t
+; CHECK-NEXT:    vsrl.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v9, v9, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %res = call <vscale x 1 x i9> @llvm.vp.fshl.nxv1i9(<vscale x 1 x i9> %a, <vscale x 1 x i9> %b, <vscale x 1 x i9> %c, <vscale x 1 x i1> %m, i32 %evl)
@@ -1330,15 +1328,14 @@ declare <vscale x 1 x i4> @llvm.vp.fshr.nxv1i4(<vscale x 1 x i4>, <vscale x 1 x
 define <vscale x 1 x i8> @fshr_v1i4(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, <vscale x 1 x i8> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: fshr_v1i4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vand.vi v10, v10, 15
-; CHECK-NEXT:    li a1, 4
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vremu.vx v10, v10, a1, v0.t
+; CHECK-NEXT:    vand.vi v10, v10, 15, v0.t
 ; CHECK-NEXT:    vand.vi v9, v9, 15, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsrl.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    li a0, 4
+; CHECK-NEXT:    vremu.vx v9, v10, a0, v0.t
+; CHECK-NEXT:    vsrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    ret
   %trunca = call <vscale x 1 x i4> @llvm.vp.trunc.nxv1i4.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i1> %m, i32 zeroext %evl)
@@ -1353,15 +1350,14 @@ declare <vscale x 1 x i4> @llvm.vp.fshl.nxv1i4(<vscale x 1 x i4>, <vscale x 1 x
 define <vscale x 1 x i8> @fshl_v1i4(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, <vscale x 1 x i8> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: fshl_v1i4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vand.vi v10, v10, 15
-; CHECK-NEXT:    li a1, 4
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vremu.vx v10, v10, a1, v0.t
+; CHECK-NEXT:    vand.vi v10, v10, 15, v0.t
 ; CHECK-NEXT:    vand.vi v9, v9, 15, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    vsll.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    li a0, 4
+; CHECK-NEXT:    vremu.vx v9, v10, a0, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll
index 3665669d83a3d..21615b516da89 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll
@@ -7,13 +7,14 @@
 define <vscale x 1 x half> @trunc_nxv1f16(<vscale x 1 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv1f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI0_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, mu
@@ -27,13 +28,14 @@ declare <vscale x 1 x half> @llvm.experimental.constrained.trunc.nxv1f16(<vscale
 define <vscale x 2 x half> @trunc_nxv2f16(<vscale x 2 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI1_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, mu
@@ -47,13 +49,14 @@ declare <vscale x 2 x half> @llvm.experimental.constrained.trunc.nxv2f16(<vscale
 define <vscale x 4 x half> @trunc_nxv4f16(<vscale x 4 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv4f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI2_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI2_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
@@ -67,13 +70,14 @@ declare <vscale x 4 x half> @llvm.experimental.constrained.trunc.nxv4f16(<vscale
 define <vscale x 8 x half> @trunc_nxv8f16(<vscale x 8 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv8f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI3_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI3_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -87,13 +91,14 @@ declare <vscale x 8 x half> @llvm.experimental.constrained.trunc.nxv8f16(<vscale
 define <vscale x 16 x half> @trunc_nxv16f16(<vscale x 16 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv16f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -107,13 +112,14 @@ declare <vscale x 16 x half> @llvm.experimental.constrained.trunc.nxv16f16(<vsca
 define <vscale x 32 x half> @trunc_nxv32f16(<vscale x 32 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv32f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
@@ -127,13 +133,14 @@ declare <vscale x 32 x half> @llvm.experimental.constrained.trunc.nxv32f16(<vsca
 define <vscale x 1 x float> @trunc_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv1f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
@@ -147,13 +154,14 @@ declare <vscale x 1 x float> @llvm.experimental.constrained.trunc.nxv1f32(<vscal
 define <vscale x 2 x float> @trunc_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
@@ -167,13 +175,14 @@ declare <vscale x 2 x float> @llvm.experimental.constrained.trunc.nxv2f32(<vscal
 define <vscale x 4 x float> @trunc_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
@@ -187,13 +196,14 @@ declare <vscale x 4 x float> @llvm.experimental.constrained.trunc.nxv4f32(<vscal
 define <vscale x 8 x float> @trunc_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
@@ -207,13 +217,14 @@ declare <vscale x 8 x float> @llvm.experimental.constrained.trunc.nxv8f32(<vscal
 define <vscale x 16 x float> @trunc_nxv16f32(<vscale x 16 x float> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv16f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
@@ -227,13 +238,14 @@ declare <vscale x 16 x float> @llvm.experimental.constrained.trunc.nxv16f32(<vsc
 define <vscale x 1 x double> @trunc_nxv1f64(<vscale x 1 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv1f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI11_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
@@ -247,13 +259,14 @@ declare <vscale x 1 x double> @llvm.experimental.constrained.trunc.nxv1f64(<vsca
 define <vscale x 2 x double> @trunc_nxv2f64(<vscale x 2 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI12_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v10, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -267,13 +280,14 @@ declare <vscale x 2 x double> @llvm.experimental.constrained.trunc.nxv2f64(<vsca
 define <vscale x 4 x double> @trunc_nxv4f64(<vscale x 4 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI13_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI13_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v12, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -287,13 +301,14 @@ declare <vscale x 4 x double> @llvm.experimental.constrained.trunc.nxv4f64(<vsca
 define <vscale x 8 x double> @trunc_nxv8f64(<vscale x 8 x double> %x) strictfp {
 ; CHECK-LABEL: trunc_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI14_0)(a0)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
diff --git a/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll
new file mode 100644
index 0000000000000..6a7da925b4d43
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/mul-combine.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64
+
+define <2 x i16> @test_v2i16(<2 x i16> %x) {
+; CHECK-RV32-LABEL: test_v2i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-RV32-NEXT:    vsra.vi v8, v8, 7
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_v2i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-RV64-NEXT:    vsra.vi v8, v8, 7
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <2 x i16> %x, <i16 7, i16 7>
+  %2 = and <2 x i16> %1, <i16 257, i16 257>
+  %3 = mul <2 x i16> %2, <i16 255, i16 255>
+  ret <2 x i16> %3
+}
+
+define <vscale x 2 x i16> @test_nxv2i16(<vscale x 2 x i16> %x) {
+; CHECK-RV32-LABEL: test_nxv2i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-RV32-NEXT:    vsrl.vi v8, v8, 7
+; CHECK-RV32-NEXT:    li a0, 257
+; CHECK-RV32-NEXT:    vand.vx v8, v8, a0
+; CHECK-RV32-NEXT:    vsll.vi v8, v8, 8
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_nxv2i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-RV64-NEXT:    vsrl.vi v8, v8, 7
+; CHECK-RV64-NEXT:    li a0, 257
+; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
+; CHECK-RV64-NEXT:    vsll.vi v8, v8, 8
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <vscale x 2 x i16> %x, splat (i16 7)
+  %2 = and <vscale x 2 x i16> %1, splat (i16 257)
+  %3 = mul <vscale x 2 x i16> %2, splat (i16 256)
+  ret <vscale x 2 x i16> %3
+}
+
+define <2 x i32> @test_v2i32(<2 x i32> %x) {
+; CHECK-RV32-LABEL: test_v2i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-RV32-NEXT:    vsra.vi v8, v8, 15
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_v2i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-RV64-NEXT:    vsra.vi v8, v8, 15
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <2 x i32> %x, <i32 15, i32 15>
+  %2 = and <2 x i32> %1, <i32 65537, i32 65537>
+  %3 = mul <2 x i32> %2, <i32 65535, i32 65535>
+  ret <2 x i32> %3
+}
+
+define <vscale x 2 x i32> @test_nxv2i32(<vscale x 2 x i32> %x) {
+; CHECK-RV32-LABEL: test_nxv2i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-RV32-NEXT:    vsra.vi v8, v8, 15
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_nxv2i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-RV64-NEXT:    vsra.vi v8, v8, 15
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <vscale x 2 x i32> %x, splat (i32 15)
+  %2 = and <vscale x 2 x i32> %1, splat (i32 65537)
+  %3 = mul <vscale x 2 x i32> %2, splat (i32 65535)
+  ret <vscale x 2 x i32> %3
+}
+
+define <2 x i64> @test_v2i64(<2 x i64> %x) {
+; CHECK-RV32-LABEL: test_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32-NEXT:    vsra.vi v8, v8, 31
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64-NEXT:    vsra.vi v8, v8, 31
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <2 x i64> %x, <i64 31, i64 31>
+  %2 = and <2 x i64> %1, <i64 4294967297, i64 4294967297>
+  %3 = mul <2 x i64> %2, <i64 4294967295, i64 4294967295>
+  ret <2 x i64> %3
+}
+
+define <vscale x 2 x i64> @test_nxv2i64(<vscale x 2 x i64> %x) {
+; CHECK-RV32-LABEL: test_nxv2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-RV32-NEXT:    vsra.vi v8, v8, 31
+; CHECK-RV32-NEXT:    ret
+;
+; CHECK-RV64-LABEL: test_nxv2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-RV64-NEXT:    vsra.vi v8, v8, 31
+; CHECK-RV64-NEXT:    ret
+  %1 = lshr <vscale x 2 x i64> %x, splat (i64 31)
+  %2 = and <vscale x 2 x i64> %1, splat (i64 4294967297)
+  %3 = mul <vscale x 2 x i64> %2, splat (i64 4294967295)
+  ret <vscale x 2 x i64> %3
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
index 6e327457bebff..368f454fa5fda 100644
--- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
@@ -106,11 +106,11 @@ define <16 x i8> @v16i8(<16 x i8> %a) {
 define <32 x i8> @v16i8_2(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: v16i8_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI7_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI7_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI7_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI7_0)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    vmv1r.v v14, v9
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vid.v v8
@@ -230,11 +230,11 @@ define <16 x i16> @v16i16(<16 x i16> %a) {
 define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-LABEL: v16i16_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI15_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI15_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI15_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI15_0)
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vle16.v v16, (a1)
 ; CHECK-NEXT:    vmv2r.v v20, v10
 ; CHECK-NEXT:    vmv2r.v v12, v8
 ; CHECK-NEXT:    vrgather.vv v8, v12, v16
@@ -363,11 +363,11 @@ define <16 x i32> @v16i32(<16 x i32> %a) {
 define <32 x i32> @v16i32_2(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: v16i32_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI23_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle16.v v20, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI23_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI23_0)
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vle16.v v20, (a1)
 ; CHECK-NEXT:    vmv4r.v v24, v12
 ; CHECK-NEXT:    vmv4r.v v16, v8
 ; CHECK-NEXT:    vrgatherei16.vv v8, v16, v20
@@ -548,11 +548,11 @@ define <16 x half> @v16f16(<16 x half> %a) {
 define <32 x half> @v16f16_2(<16 x half> %a) {
 ; CHECK-LABEL: v16f16_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI35_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI35_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI35_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI35_0)
+; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT:    vle16.v v16, (a1)
 ; CHECK-NEXT:    vrgather.vv v12, v8, v16
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
@@ -719,11 +719,11 @@ define <8 x double> @v4f64_2(<4 x double> %a, <4 x double> %b) {
 define <32 x i8> @v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: v32i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI46_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI46_0)
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v12, (a0)
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI46_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI46_0)
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
index 26089706cf99e..a4b7ca7f39768 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
@@ -9,11 +9,15 @@ declare <vscale x 8 x i7> @llvm.vp.sdiv.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x
 define <vscale x 8 x i7> @vdiv_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: vdiv_vx_nxv8i7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vdiv.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vdiv.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
   %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
index f41b885a66eaa..67c3f9dbf2869 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
@@ -10,11 +10,12 @@ define <vscale x 8 x i7> @vdivu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vdivu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vdivu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 8a76467986620..c15caa31bb098 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -9,11 +9,15 @@ declare <vscale x 8 x i7> @llvm.vp.smax.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x
 define <vscale x 8 x i7> @vmax_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: vmax_vx_nxv8i7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vmax.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
   %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index 1c74887c1b20f..df494f8af7387 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -10,11 +10,12 @@ define <vscale x 8 x i7> @vmaxu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vmaxu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vmaxu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index 1c71242c3c7d7..794a21c7c6aba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -9,11 +9,15 @@ declare <vscale x 8 x i7> @llvm.vp.smin.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x
 define <vscale x 8 x i7> @vmin_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: vmin_vx_nxv8i7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vmin.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
   %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index 6d89a9777cf91..d54de281a7fd2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -10,11 +10,12 @@ define <vscale x 8 x i7> @vminu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vminu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vminu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
index cf85fd827b51f..2ef96f4b3896f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
@@ -9,11 +9,15 @@ declare <vscale x 8 x i7> @llvm.vp.srem.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x
 define <vscale x 8 x i7> @vrem_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: vrem_vx_nxv8i7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
+; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vrem.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
+; CHECK-NEXT:    vrem.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
   %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
index 61bdd5b8d3c8a..1f1ed4a1269ac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
@@ -10,11 +10,12 @@ define <vscale x 8 x i7> @vremu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vremu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vremu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll
index c04d5ea2da3c1..380835494ed17 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vshl-vp.ll
@@ -12,8 +12,8 @@ define <vscale x 8 x i7> @vsll_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <v
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsll.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll
index 7bae84142d8ae..8dbb57fd15cf1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsra-sdnode.ll
@@ -934,3 +934,22 @@ define <vscale x 8 x i32> @vsra_vi_mask_nxv8i32(<vscale x 8 x i32> %va, <vscale
   %vc = ashr <vscale x 8 x i32> %va, %vs
   ret <vscale x 8 x i32> %vc
 }
+
+; Negative test. We shouldn't look through the vp.trunc because, unlike the
+; rest of the code, it does not operate at VLMAX.
+define <vscale x 1 x i8> @vsra_vv_nxv1i8_sext_zext_mixed_trunc(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl) {
+; CHECK-LABEL: vsra_vv_nxv1i8_sext_zext_mixed_trunc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 7
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmin.vx v9, v8, a0
+; CHECK-NEXT:    vsra.vv v8, v8, v9
+; CHECK-NEXT:    ret
+  %sexted_va = sext <vscale x 1 x i8> %va to <vscale x 1 x i32>
+  %zexted_vb = zext <vscale x 1 x i8> %va to <vscale x 1 x i32>
+  %expand = ashr <vscale x 1 x i32> %sexted_va, %zexted_vb
+  %vc = trunc <vscale x 1 x i32> %expand to <vscale x 1 x i16>
+  %vd = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxvi16(<vscale x 1 x i16> %vc, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i8> %vd
+}
+declare <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxvi16(<vscale x 1 x i16>, <vscale x 1 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll
index 632c4db5c5bb5..cff8cc710d21f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll
@@ -9,13 +9,14 @@ declare <vscale x 8 x i7> @llvm.vp.ashr.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x
 define <vscale x 8 x i7> @vsra_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: vsra_vx_nxv8i7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll
index ec5b7f3faf7ca..ff6771b643031 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll
@@ -10,11 +10,12 @@ define <vscale x 8 x i7> @vsrl_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <v
 ; CHECK-LABEL: vsrl_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
diff --git a/llvm/test/CodeGen/RISCV/shl-cttz.ll b/llvm/test/CodeGen/RISCV/shl-cttz.ll
new file mode 100644
index 0000000000000..0eeb8b04c7e5d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/shl-cttz.ll
@@ -0,0 +1,807 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv32 -mattr=+m < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32I
+; RUN: llc -mtriple=riscv32 -mattr=+m,+zbb < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32ZBB
+; RUN: llc -mtriple=riscv64 -mattr=+m < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV64I,RV64IILLEGALI32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zbb < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV64ZBB,RV64ZBBILLEGALI32
+; RUN: llc -mtriple=riscv64 -mattr=+m -riscv-experimental-rv64-legal-i32 < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV64I,RV64ILEGALI32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zbb -riscv-experimental-rv64-legal-i32 < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV64ZBB,RV64ZBBLEGALI32
+
+define i8 @shl_cttz_i8(i8 %x, i8 %y) {
+; RV32I-LABEL: shl_cttz_i8:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi a2, a1, -1
+; RV32I-NEXT:    not a1, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a1, 1
+; RV32I-NEXT:    andi a2, a2, 85
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    andi a2, a1, 51
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    andi a1, a1, 51
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    andi a1, a1, 15
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_i8:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a1, a1
+; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64IILLEGALI32-LABEL: shl_cttz_i8:
+; RV64IILLEGALI32:       # %bb.0: # %entry
+; RV64IILLEGALI32-NEXT:    addi a2, a1, -1
+; RV64IILLEGALI32-NEXT:    not a1, a1
+; RV64IILLEGALI32-NEXT:    and a1, a1, a2
+; RV64IILLEGALI32-NEXT:    srli a2, a1, 1
+; RV64IILLEGALI32-NEXT:    andi a2, a2, 85
+; RV64IILLEGALI32-NEXT:    subw a1, a1, a2
+; RV64IILLEGALI32-NEXT:    andi a2, a1, 51
+; RV64IILLEGALI32-NEXT:    srli a1, a1, 2
+; RV64IILLEGALI32-NEXT:    andi a1, a1, 51
+; RV64IILLEGALI32-NEXT:    add a1, a2, a1
+; RV64IILLEGALI32-NEXT:    srli a2, a1, 4
+; RV64IILLEGALI32-NEXT:    add a1, a1, a2
+; RV64IILLEGALI32-NEXT:    andi a1, a1, 15
+; RV64IILLEGALI32-NEXT:    sll a0, a0, a1
+; RV64IILLEGALI32-NEXT:    ret
+;
+; RV64ZBBILLEGALI32-LABEL: shl_cttz_i8:
+; RV64ZBBILLEGALI32:       # %bb.0: # %entry
+; RV64ZBBILLEGALI32-NEXT:    ctz a1, a1
+; RV64ZBBILLEGALI32-NEXT:    sll a0, a0, a1
+; RV64ZBBILLEGALI32-NEXT:    ret
+;
+; RV64ILEGALI32-LABEL: shl_cttz_i8:
+; RV64ILEGALI32:       # %bb.0: # %entry
+; RV64ILEGALI32-NEXT:    addi a2, a1, -1
+; RV64ILEGALI32-NEXT:    not a1, a1
+; RV64ILEGALI32-NEXT:    and a1, a1, a2
+; RV64ILEGALI32-NEXT:    srliw a2, a1, 1
+; RV64ILEGALI32-NEXT:    andi a2, a2, 85
+; RV64ILEGALI32-NEXT:    subw a1, a1, a2
+; RV64ILEGALI32-NEXT:    andi a2, a1, 51
+; RV64ILEGALI32-NEXT:    srliw a1, a1, 2
+; RV64ILEGALI32-NEXT:    andi a1, a1, 51
+; RV64ILEGALI32-NEXT:    add a1, a2, a1
+; RV64ILEGALI32-NEXT:    srliw a2, a1, 4
+; RV64ILEGALI32-NEXT:    add a1, a1, a2
+; RV64ILEGALI32-NEXT:    andi a1, a1, 15
+; RV64ILEGALI32-NEXT:    sllw a0, a0, a1
+; RV64ILEGALI32-NEXT:    ret
+;
+; RV64ZBBLEGALI32-LABEL: shl_cttz_i8:
+; RV64ZBBLEGALI32:       # %bb.0: # %entry
+; RV64ZBBLEGALI32-NEXT:    ctzw a1, a1
+; RV64ZBBLEGALI32-NEXT:    sllw a0, a0, a1
+; RV64ZBBLEGALI32-NEXT:    ret
+entry:
+  %cttz = call i8 @llvm.cttz.i8(i8 %y, i1 true)
+  %res = shl i8 %x, %cttz
+  ret i8 %res
+}
+
+define i8 @shl_cttz_constant_i8(i8 %y) {
+; RV32I-LABEL: shl_cttz_constant_i8:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi a1, a0, -1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    li a1, 4
+; RV32I-NEXT:    sll a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_constant_i8:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    li a1, 4
+; RV32ZBB-NEXT:    sll a0, a1, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV64IILLEGALI32-LABEL: shl_cttz_constant_i8:
+; RV64IILLEGALI32:       # %bb.0: # %entry
+; RV64IILLEGALI32-NEXT:    addi a1, a0, -1
+; RV64IILLEGALI32-NEXT:    not a0, a0
+; RV64IILLEGALI32-NEXT:    and a0, a0, a1
+; RV64IILLEGALI32-NEXT:    srli a1, a0, 1
+; RV64IILLEGALI32-NEXT:    andi a1, a1, 85
+; RV64IILLEGALI32-NEXT:    subw a0, a0, a1
+; RV64IILLEGALI32-NEXT:    andi a1, a0, 51
+; RV64IILLEGALI32-NEXT:    srli a0, a0, 2
+; RV64IILLEGALI32-NEXT:    andi a0, a0, 51
+; RV64IILLEGALI32-NEXT:    add a0, a1, a0
+; RV64IILLEGALI32-NEXT:    srli a1, a0, 4
+; RV64IILLEGALI32-NEXT:    add a0, a0, a1
+; RV64IILLEGALI32-NEXT:    andi a0, a0, 15
+; RV64IILLEGALI32-NEXT:    li a1, 4
+; RV64IILLEGALI32-NEXT:    sll a0, a1, a0
+; RV64IILLEGALI32-NEXT:    ret
+;
+; RV64ZBBILLEGALI32-LABEL: shl_cttz_constant_i8:
+; RV64ZBBILLEGALI32:       # %bb.0: # %entry
+; RV64ZBBILLEGALI32-NEXT:    ctz a0, a0
+; RV64ZBBILLEGALI32-NEXT:    li a1, 4
+; RV64ZBBILLEGALI32-NEXT:    sll a0, a1, a0
+; RV64ZBBILLEGALI32-NEXT:    ret
+;
+; RV64ILEGALI32-LABEL: shl_cttz_constant_i8:
+; RV64ILEGALI32:       # %bb.0: # %entry
+; RV64ILEGALI32-NEXT:    addi a1, a0, -1
+; RV64ILEGALI32-NEXT:    not a0, a0
+; RV64ILEGALI32-NEXT:    and a0, a0, a1
+; RV64ILEGALI32-NEXT:    srliw a1, a0, 1
+; RV64ILEGALI32-NEXT:    andi a1, a1, 85
+; RV64ILEGALI32-NEXT:    subw a0, a0, a1
+; RV64ILEGALI32-NEXT:    andi a1, a0, 51
+; RV64ILEGALI32-NEXT:    srliw a0, a0, 2
+; RV64ILEGALI32-NEXT:    andi a0, a0, 51
+; RV64ILEGALI32-NEXT:    add a0, a1, a0
+; RV64ILEGALI32-NEXT:    srliw a1, a0, 4
+; RV64ILEGALI32-NEXT:    add a0, a0, a1
+; RV64ILEGALI32-NEXT:    andi a0, a0, 15
+; RV64ILEGALI32-NEXT:    li a1, 4
+; RV64ILEGALI32-NEXT:    sllw a0, a1, a0
+; RV64ILEGALI32-NEXT:    ret
+;
+; RV64ZBBLEGALI32-LABEL: shl_cttz_constant_i8:
+; RV64ZBBLEGALI32:       # %bb.0: # %entry
+; RV64ZBBLEGALI32-NEXT:    ctzw a0, a0
+; RV64ZBBLEGALI32-NEXT:    li a1, 4
+; RV64ZBBLEGALI32-NEXT:    sllw a0, a1, a0
+; RV64ZBBLEGALI32-NEXT:    ret
+entry:
+  %cttz = call i8 @llvm.cttz.i8(i8 %y, i1 true)
+  %res = shl i8 4, %cttz
+  ret i8 %res
+}
+
+define i16 @shl_cttz_i16(i16 %x, i16 %y) {
+; RV32I-LABEL: shl_cttz_i16:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi a2, a1, -1
+; RV32I-NEXT:    not a1, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a1, 1
+; RV32I-NEXT:    lui a3, 5
+; RV32I-NEXT:    addi a3, a3, 1365
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    and a3, a1, a2
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    andi a2, a1, 15
+; RV32I-NEXT:    slli a1, a1, 20
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_i16:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a1, a1
+; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64IILLEGALI32-LABEL: shl_cttz_i16:
+; RV64IILLEGALI32:       # %bb.0: # %entry
+; RV64IILLEGALI32-NEXT:    addi a2, a1, -1
+; RV64IILLEGALI32-NEXT:    not a1, a1
+; RV64IILLEGALI32-NEXT:    and a1, a1, a2
+; RV64IILLEGALI32-NEXT:    srli a2, a1, 1
+; RV64IILLEGALI32-NEXT:    lui a3, 5
+; RV64IILLEGALI32-NEXT:    addiw a3, a3, 1365
+; RV64IILLEGALI32-NEXT:    and a2, a2, a3
+; RV64IILLEGALI32-NEXT:    sub a1, a1, a2
+; RV64IILLEGALI32-NEXT:    lui a2, 3
+; RV64IILLEGALI32-NEXT:    addiw a2, a2, 819
+; RV64IILLEGALI32-NEXT:    and a3, a1, a2
+; RV64IILLEGALI32-NEXT:    srli a1, a1, 2
+; RV64IILLEGALI32-NEXT:    and a1, a1, a2
+; RV64IILLEGALI32-NEXT:    add a1, a3, a1
+; RV64IILLEGALI32-NEXT:    srli a2, a1, 4
+; RV64IILLEGALI32-NEXT:    add a1, a1, a2
+; RV64IILLEGALI32-NEXT:    andi a2, a1, 15
+; RV64IILLEGALI32-NEXT:    slli a1, a1, 52
+; RV64IILLEGALI32-NEXT:    srli a1, a1, 60
+; RV64IILLEGALI32-NEXT:    add a1, a2, a1
+; RV64IILLEGALI32-NEXT:    sll a0, a0, a1
+; RV64IILLEGALI32-NEXT:    ret
+;
+; RV64ZBBILLEGALI32-LABEL: shl_cttz_i16:
+; RV64ZBBILLEGALI32:       # %bb.0: # %entry
+; RV64ZBBILLEGALI32-NEXT:    ctz a1, a1
+; RV64ZBBILLEGALI32-NEXT:    sll a0, a0, a1
+; RV64ZBBILLEGALI32-NEXT:    ret
+;
+; RV64ILEGALI32-LABEL: shl_cttz_i16:
+; RV64ILEGALI32:       # %bb.0: # %entry
+; RV64ILEGALI32-NEXT:    addi a2, a1, -1
+; RV64ILEGALI32-NEXT:    not a1, a1
+; RV64ILEGALI32-NEXT:    and a1, a1, a2
+; RV64ILEGALI32-NEXT:    srliw a2, a1, 1
+; RV64ILEGALI32-NEXT:    lui a3, 5
+; RV64ILEGALI32-NEXT:    addi a3, a3, 1365
+; RV64ILEGALI32-NEXT:    and a2, a2, a3
+; RV64ILEGALI32-NEXT:    subw a1, a1, a2
+; RV64ILEGALI32-NEXT:    lui a2, 3
+; RV64ILEGALI32-NEXT:    addi a2, a2, 819
+; RV64ILEGALI32-NEXT:    and a3, a1, a2
+; RV64ILEGALI32-NEXT:    srliw a1, a1, 2
+; RV64ILEGALI32-NEXT:    and a1, a1, a2
+; RV64ILEGALI32-NEXT:    add a1, a3, a1
+; RV64ILEGALI32-NEXT:    srliw a2, a1, 4
+; RV64ILEGALI32-NEXT:    add a1, a1, a2
+; RV64ILEGALI32-NEXT:    andi a2, a1, 15
+; RV64ILEGALI32-NEXT:    slli a1, a1, 52
+; RV64ILEGALI32-NEXT:    srli a1, a1, 60
+; RV64ILEGALI32-NEXT:    add a1, a2, a1
+; RV64ILEGALI32-NEXT:    sllw a0, a0, a1
+; RV64ILEGALI32-NEXT:    ret
+;
+; RV64ZBBLEGALI32-LABEL: shl_cttz_i16:
+; RV64ZBBLEGALI32:       # %bb.0: # %entry
+; RV64ZBBLEGALI32-NEXT:    ctzw a1, a1
+; RV64ZBBLEGALI32-NEXT:    sllw a0, a0, a1
+; RV64ZBBLEGALI32-NEXT:    ret
+entry:
+  %cttz = call i16 @llvm.cttz.i16(i16 %y, i1 true)
+  %res = shl i16 %x, %cttz
+  ret i16 %res
+}
+
+define i16 @shl_cttz_constant_i16(i16 %y) {
+; RV32I-LABEL: shl_cttz_constant_i16:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi a1, a0, -1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    lui a1, 3
+; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    li a1, 4
+; RV32I-NEXT:    sll a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_constant_i16:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    li a1, 4
+; RV32ZBB-NEXT:    sll a0, a1, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV64IILLEGALI32-LABEL: shl_cttz_constant_i16:
+; RV64IILLEGALI32:       # %bb.0: # %entry
+; RV64IILLEGALI32-NEXT:    addi a1, a0, -1
+; RV64IILLEGALI32-NEXT:    not a0, a0
+; RV64IILLEGALI32-NEXT:    and a0, a0, a1
+; RV64IILLEGALI32-NEXT:    srli a1, a0, 1
+; RV64IILLEGALI32-NEXT:    lui a2, 5
+; RV64IILLEGALI32-NEXT:    addiw a2, a2, 1365
+; RV64IILLEGALI32-NEXT:    and a1, a1, a2
+; RV64IILLEGALI32-NEXT:    sub a0, a0, a1
+; RV64IILLEGALI32-NEXT:    lui a1, 3
+; RV64IILLEGALI32-NEXT:    addiw a1, a1, 819
+; RV64IILLEGALI32-NEXT:    and a2, a0, a1
+; RV64IILLEGALI32-NEXT:    srli a0, a0, 2
+; RV64IILLEGALI32-NEXT:    and a0, a0, a1
+; RV64IILLEGALI32-NEXT:    add a0, a2, a0
+; RV64IILLEGALI32-NEXT:    srli a1, a0, 4
+; RV64IILLEGALI32-NEXT:    add a0, a0, a1
+; RV64IILLEGALI32-NEXT:    andi a1, a0, 15
+; RV64IILLEGALI32-NEXT:    slli a0, a0, 52
+; RV64IILLEGALI32-NEXT:    srli a0, a0, 60
+; RV64IILLEGALI32-NEXT:    add a0, a1, a0
+; RV64IILLEGALI32-NEXT:    li a1, 4
+; RV64IILLEGALI32-NEXT:    sll a0, a1, a0
+; RV64IILLEGALI32-NEXT:    ret
+;
+; RV64ZBBILLEGALI32-LABEL: shl_cttz_constant_i16:
+; RV64ZBBILLEGALI32:       # %bb.0: # %entry
+; RV64ZBBILLEGALI32-NEXT:    ctz a0, a0
+; RV64ZBBILLEGALI32-NEXT:    li a1, 4
+; RV64ZBBILLEGALI32-NEXT:    sll a0, a1, a0
+; RV64ZBBILLEGALI32-NEXT:    ret
+;
+; RV64ILEGALI32-LABEL: shl_cttz_constant_i16:
+; RV64ILEGALI32:       # %bb.0: # %entry
+; RV64ILEGALI32-NEXT:    addi a1, a0, -1
+; RV64ILEGALI32-NEXT:    not a0, a0
+; RV64ILEGALI32-NEXT:    and a0, a0, a1
+; RV64ILEGALI32-NEXT:    srliw a1, a0, 1
+; RV64ILEGALI32-NEXT:    lui a2, 5
+; RV64ILEGALI32-NEXT:    addi a2, a2, 1365
+; RV64ILEGALI32-NEXT:    and a1, a1, a2
+; RV64ILEGALI32-NEXT:    subw a0, a0, a1
+; RV64ILEGALI32-NEXT:    lui a1, 3
+; RV64ILEGALI32-NEXT:    addi a1, a1, 819
+; RV64ILEGALI32-NEXT:    and a2, a0, a1
+; RV64ILEGALI32-NEXT:    srliw a0, a0, 2
+; RV64ILEGALI32-NEXT:    and a0, a0, a1
+; RV64ILEGALI32-NEXT:    add a0, a2, a0
+; RV64ILEGALI32-NEXT:    srliw a1, a0, 4
+; RV64ILEGALI32-NEXT:    add a0, a0, a1
+; RV64ILEGALI32-NEXT:    andi a1, a0, 15
+; RV64ILEGALI32-NEXT:    slli a0, a0, 52
+; RV64ILEGALI32-NEXT:    srli a0, a0, 60
+; RV64ILEGALI32-NEXT:    add a0, a1, a0
+; RV64ILEGALI32-NEXT:    li a1, 4
+; RV64ILEGALI32-NEXT:    sllw a0, a1, a0
+; RV64ILEGALI32-NEXT:    ret
+;
+; RV64ZBBLEGALI32-LABEL: shl_cttz_constant_i16:
+; RV64ZBBLEGALI32:       # %bb.0: # %entry
+; RV64ZBBLEGALI32-NEXT:    ctzw a0, a0
+; RV64ZBBLEGALI32-NEXT:    li a1, 4
+; RV64ZBBLEGALI32-NEXT:    sllw a0, a1, a0
+; RV64ZBBLEGALI32-NEXT:    ret
+entry:
+  %cttz = call i16 @llvm.cttz.i16(i16 %y, i1 true)
+  %res = shl i16 4, %cttz
+  ret i16 %res
+}
+
+define i32 @shl_cttz_i32(i32 %x, i32 %y) {
+; RV32I-LABEL: shl_cttz_i32:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    neg a2, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    mul a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_i32:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a1, a1
+; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_i32:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    negw a2, a1
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 30667
+; RV64I-NEXT:    addi a2, a2, 1329
+; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    srliw a1, a1, 27
+; RV64I-NEXT:    lui a2, %hi(.LCPI4_0)
+; RV64I-NEXT:    addi a2, a2, %lo(.LCPI4_0)
+; RV64I-NEXT:    add a1, a2, a1
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    sllw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_i32:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    ctzw a1, a1
+; RV64ZBB-NEXT:    sllw a0, a0, a1
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 true)
+  %res = shl i32 %x, %cttz
+  ret i32 %res
+}
+
+define i32 @shl_cttz_i32_zero_is_defined(i32 %x, i32 %y) {
+; RV32I-LABEL: shl_cttz_i32_zero_is_defined:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    beqz a1, .LBB5_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    neg a2, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 30667
+; RV32I-NEXT:    addi a2, a2, 1329
+; RV32I-NEXT:    mul a1, a1, a2
+; RV32I-NEXT:    srli a1, a1, 27
+; RV32I-NEXT:    lui a2, %hi(.LCPI5_0)
+; RV32I-NEXT:    addi a2, a2, %lo(.LCPI5_0)
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB5_2:
+; RV32I-NEXT:    li a1, 32
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_i32_zero_is_defined:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a1, a1
+; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_i32_zero_is_defined:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    sext.w a2, a1
+; RV64I-NEXT:    beqz a2, .LBB5_2
+; RV64I-NEXT:  # %bb.1: # %cond.false
+; RV64I-NEXT:    negw a2, a1
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 30667
+; RV64I-NEXT:    addi a2, a2, 1329
+; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    srliw a1, a1, 27
+; RV64I-NEXT:    lui a2, %hi(.LCPI5_0)
+; RV64I-NEXT:    addi a2, a2, %lo(.LCPI5_0)
+; RV64I-NEXT:    add a1, a2, a1
+; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    sllw a0, a0, a1
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB5_2:
+; RV64I-NEXT:    li a1, 32
+; RV64I-NEXT:    sllw a0, a0, a1
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_i32_zero_is_defined:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    ctzw a1, a1
+; RV64ZBB-NEXT:    sllw a0, a0, a1
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+  %res = shl i32 %x, %cttz
+  ret i32 %res
+}
+
+define i32 @shl_cttz_constant_i32(i32 %y) {
+; RV32I-LABEL: shl_cttz_constant_i32:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_constant_i32:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    li a1, 4
+; RV32ZBB-NEXT:    sll a0, a1, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_constant_i32:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    negw a1, a0
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    lui a1, 30667
+; RV64I-NEXT:    addi a1, a1, 1329
+; RV64I-NEXT:    mul a0, a0, a1
+; RV64I-NEXT:    srliw a0, a0, 27
+; RV64I-NEXT:    lui a1, %hi(.LCPI6_0)
+; RV64I-NEXT:    addi a1, a1, %lo(.LCPI6_0)
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    lbu a0, 0(a0)
+; RV64I-NEXT:    li a1, 4
+; RV64I-NEXT:    sllw a0, a1, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_constant_i32:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    ctzw a0, a0
+; RV64ZBB-NEXT:    li a1, 4
+; RV64ZBB-NEXT:    sllw a0, a1, a0
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 true)
+  %res = shl i32 4, %cttz
+  ret i32 %res
+}
+
+define i32 @shl_cttz_multiuse_i32(i32 %x, i32 %y) {
+; RV32I-LABEL: shl_cttz_multiuse_i32:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    neg a2, a1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 30667
+; RV32I-NEXT:    addi a2, a2, 1329
+; RV32I-NEXT:    mul a1, a1, a2
+; RV32I-NEXT:    srli a1, a1, 27
+; RV32I-NEXT:    lui a2, %hi(.LCPI7_0)
+; RV32I-NEXT:    addi a2, a2, %lo(.LCPI7_0)
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    lbu s0, 0(a1)
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    call use32
+; RV32I-NEXT:    sll a0, s1, s0
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_multiuse_i32:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    addi sp, sp, -16
+; RV32ZBB-NEXT:    .cfi_def_cfa_offset 16
+; RV32ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32ZBB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32ZBB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
+; RV32ZBB-NEXT:    .cfi_offset ra, -4
+; RV32ZBB-NEXT:    .cfi_offset s0, -8
+; RV32ZBB-NEXT:    .cfi_offset s1, -12
+; RV32ZBB-NEXT:    mv s0, a0
+; RV32ZBB-NEXT:    ctz s1, a1
+; RV32ZBB-NEXT:    mv a0, s1
+; RV32ZBB-NEXT:    call use32
+; RV32ZBB-NEXT:    sll a0, s0, s1
+; RV32ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32ZBB-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32ZBB-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32ZBB-NEXT:    addi sp, sp, 16
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_multiuse_i32:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -32
+; RV64I-NEXT:    .cfi_def_cfa_offset 32
+; RV64I-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    negw a2, a1
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 30667
+; RV64I-NEXT:    addi a2, a2, 1329
+; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    srliw a1, a1, 27
+; RV64I-NEXT:    lui a2, %hi(.LCPI7_0)
+; RV64I-NEXT:    addi a2, a2, %lo(.LCPI7_0)
+; RV64I-NEXT:    add a1, a2, a1
+; RV64I-NEXT:    lbu s0, 0(a1)
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    call use32
+; RV64I-NEXT:    sllw a0, s1, s0
+; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 32
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_multiuse_i32:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    addi sp, sp, -32
+; RV64ZBB-NEXT:    .cfi_def_cfa_offset 32
+; RV64ZBB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64ZBB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64ZBB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
+; RV64ZBB-NEXT:    .cfi_offset ra, -8
+; RV64ZBB-NEXT:    .cfi_offset s0, -16
+; RV64ZBB-NEXT:    .cfi_offset s1, -24
+; RV64ZBB-NEXT:    mv s0, a0
+; RV64ZBB-NEXT:    ctzw s1, a1
+; RV64ZBB-NEXT:    mv a0, s1
+; RV64ZBB-NEXT:    call use32
+; RV64ZBB-NEXT:    sllw a0, s0, s1
+; RV64ZBB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64ZBB-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64ZBB-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
+; RV64ZBB-NEXT:    addi sp, sp, 32
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 true)
+  call void @use32(i32 %cttz)
+  %res = shl i32 %x, %cttz
+  ret i32 %res
+}
+
+define i64 @shl_cttz_i64(i64 %x, i64 %y) {
+; RV32I-LABEL: shl_cttz_i64:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a4, 30667
+; RV32I-NEXT:    addi a5, a4, 1329
+; RV32I-NEXT:    lui a4, %hi(.LCPI8_0)
+; RV32I-NEXT:    addi a4, a4, %lo(.LCPI8_0)
+; RV32I-NEXT:    bnez a2, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    neg a2, a3
+; RV32I-NEXT:    and a2, a3, a2
+; RV32I-NEXT:    mul a2, a2, a5
+; RV32I-NEXT:    srli a2, a2, 27
+; RV32I-NEXT:    add a2, a4, a2
+; RV32I-NEXT:    lbu a2, 0(a2)
+; RV32I-NEXT:    addi a4, a2, 32
+; RV32I-NEXT:    j .LBB8_3
+; RV32I-NEXT:  .LBB8_2:
+; RV32I-NEXT:    neg a3, a2
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    mul a2, a2, a5
+; RV32I-NEXT:    srli a2, a2, 27
+; RV32I-NEXT:    add a2, a4, a2
+; RV32I-NEXT:    lbu a4, 0(a2)
+; RV32I-NEXT:  .LBB8_3: # %entry
+; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    sll a2, a0, a4
+; RV32I-NEXT:    bltz a3, .LBB8_5
+; RV32I-NEXT:  # %bb.4: # %entry
+; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    j .LBB8_6
+; RV32I-NEXT:  .LBB8_5:
+; RV32I-NEXT:    sll a1, a1, a4
+; RV32I-NEXT:    not a4, a4
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a0, a0, a4
+; RV32I-NEXT:    or a1, a1, a0
+; RV32I-NEXT:  .LBB8_6: # %entry
+; RV32I-NEXT:    srai a0, a3, 31
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_i64:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    bnez a2, .LBB8_2
+; RV32ZBB-NEXT:  # %bb.1: # %entry
+; RV32ZBB-NEXT:    ctz a2, a3
+; RV32ZBB-NEXT:    addi a4, a2, 32
+; RV32ZBB-NEXT:    j .LBB8_3
+; RV32ZBB-NEXT:  .LBB8_2:
+; RV32ZBB-NEXT:    ctz a4, a2
+; RV32ZBB-NEXT:  .LBB8_3: # %entry
+; RV32ZBB-NEXT:    addi a3, a4, -32
+; RV32ZBB-NEXT:    sll a2, a0, a4
+; RV32ZBB-NEXT:    bltz a3, .LBB8_5
+; RV32ZBB-NEXT:  # %bb.4: # %entry
+; RV32ZBB-NEXT:    mv a1, a2
+; RV32ZBB-NEXT:    j .LBB8_6
+; RV32ZBB-NEXT:  .LBB8_5:
+; RV32ZBB-NEXT:    sll a1, a1, a4
+; RV32ZBB-NEXT:    not a4, a4
+; RV32ZBB-NEXT:    srli a0, a0, 1
+; RV32ZBB-NEXT:    srl a0, a0, a4
+; RV32ZBB-NEXT:    or a1, a1, a0
+; RV32ZBB-NEXT:  .LBB8_6: # %entry
+; RV32ZBB-NEXT:    srai a0, a3, 31
+; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_i64:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    neg a2, a1
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    mul a0, a1, a0
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_i64:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    ctz a1, a1
+; RV64ZBB-NEXT:    sll a0, a0, a1
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i64 @llvm.cttz.i64(i64 %y, i1 true)
+  %res = shl i64 %x, %cttz
+  ret i64 %res
+}
+
+define i64 @shl_cttz_constant_i64(i64 %y) {
+; RV32I-LABEL: shl_cttz_constant_i64:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    lui a2, 30667
+; RV32I-NEXT:    addi a3, a2, 1329
+; RV32I-NEXT:    lui a2, %hi(.LCPI9_0)
+; RV32I-NEXT:    addi a2, a2, %lo(.LCPI9_0)
+; RV32I-NEXT:    bnez a0, .LBB9_2
+; RV32I-NEXT:  # %bb.1: # %entry
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    mul a0, a0, a3
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    addi a1, a0, 32
+; RV32I-NEXT:    j .LBB9_3
+; RV32I-NEXT:  .LBB9_2:
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    mul a0, a0, a3
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    lbu a1, 0(a0)
+; RV32I-NEXT:  .LBB9_3: # %entry
+; RV32I-NEXT:    li a0, 4
+; RV32I-NEXT:    addi a2, a1, -32
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    bltz a2, .LBB9_5
+; RV32I-NEXT:  # %bb.4: # %entry
+; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    j .LBB9_6
+; RV32I-NEXT:  .LBB9_5:
+; RV32I-NEXT:    not a1, a1
+; RV32I-NEXT:    li a3, 2
+; RV32I-NEXT:    srl a1, a3, a1
+; RV32I-NEXT:  .LBB9_6: # %entry
+; RV32I-NEXT:    srai a2, a2, 31
+; RV32I-NEXT:    and a0, a2, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: shl_cttz_constant_i64:
+; RV32ZBB:       # %bb.0: # %entry
+; RV32ZBB-NEXT:    bnez a0, .LBB9_2
+; RV32ZBB-NEXT:  # %bb.1: # %entry
+; RV32ZBB-NEXT:    ctz a0, a1
+; RV32ZBB-NEXT:    addi a1, a0, 32
+; RV32ZBB-NEXT:    j .LBB9_3
+; RV32ZBB-NEXT:  .LBB9_2:
+; RV32ZBB-NEXT:    ctz a1, a0
+; RV32ZBB-NEXT:  .LBB9_3: # %entry
+; RV32ZBB-NEXT:    li a0, 4
+; RV32ZBB-NEXT:    addi a2, a1, -32
+; RV32ZBB-NEXT:    sll a0, a0, a1
+; RV32ZBB-NEXT:    bltz a2, .LBB9_5
+; RV32ZBB-NEXT:  # %bb.4: # %entry
+; RV32ZBB-NEXT:    mv a1, a0
+; RV32ZBB-NEXT:    j .LBB9_6
+; RV32ZBB-NEXT:  .LBB9_5:
+; RV32ZBB-NEXT:    not a1, a1
+; RV32ZBB-NEXT:    li a3, 2
+; RV32ZBB-NEXT:    srl a1, a3, a1
+; RV32ZBB-NEXT:  .LBB9_6: # %entry
+; RV32ZBB-NEXT:    srai a2, a2, 31
+; RV32ZBB-NEXT:    and a0, a2, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV64I-LABEL: shl_cttz_constant_i64:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    ret
+;
+; RV64ZBB-LABEL: shl_cttz_constant_i64:
+; RV64ZBB:       # %bb.0: # %entry
+; RV64ZBB-NEXT:    ctz a0, a0
+; RV64ZBB-NEXT:    li a1, 4
+; RV64ZBB-NEXT:    sll a0, a1, a0
+; RV64ZBB-NEXT:    ret
+entry:
+  %cttz = call i64 @llvm.cttz.i64(i64 %y, i1 true)
+  %res = shl i64 4, %cttz
+  ret i64 %res
+}
+
+declare void @use32(i32 signext)
diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll
index 87d69bfad38c2..d3e495bb723ad 100644
--- a/llvm/test/CodeGen/RISCV/tail-calls.ll
+++ b/llvm/test/CodeGen/RISCV/tail-calls.ll
@@ -56,12 +56,12 @@ define void @caller_indirect_tail(i32 %a) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    beqz a0, .LBB3_2
 ; CHECK-NEXT:  # %bb.1: # %entry
-; CHECK-NEXT:    lui a0, %hi(callee_indirect2)
-; CHECK-NEXT:    addi t1, a0, %lo(callee_indirect2)
+; CHECK-NEXT:    lui t1, %hi(callee_indirect2)
+; CHECK-NEXT:    addi t1, t1, %lo(callee_indirect2)
 ; CHECK-NEXT:    jr t1
 ; CHECK-NEXT:  .LBB3_2:
-; CHECK-NEXT:    lui a0, %hi(callee_indirect1)
-; CHECK-NEXT:    addi t1, a0, %lo(callee_indirect1)
+; CHECK-NEXT:    lui t1, %hi(callee_indirect1)
+; CHECK-NEXT:    addi t1, t1, %lo(callee_indirect1)
 ; CHECK-NEXT:    jr t1
 
 
diff --git a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
index 2fd4572d23456..6530736304837 100644
--- a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
+++ b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
@@ -10,36 +10,30 @@
 define signext i32 @unroll_loop_cse() {
 ; CHECK-LABEL: unroll_loop_cse:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(x)
-; CHECK-NEXT:    lw a3, %lo(x)(a1)
-; CHECK-NEXT:    lui a2, %hi(check)
-; CHECK-NEXT:    lw a4, %lo(check)(a2)
+; CHECK-NEXT:    lui a0, %hi(x)
+; CHECK-NEXT:    lw a1, %lo(x)(a0)
+; CHECK-NEXT:    lui a0, %hi(check)
+; CHECK-NEXT:    lw a2, %lo(check)(a0)
 ; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    bne a3, a4, .LBB0_6
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    addi a1, a1, %lo(x)
-; CHECK-NEXT:    lw a1, 4(a1)
-; CHECK-NEXT:    addi a2, a2, %lo(check)
-; CHECK-NEXT:    lw a2, 4(a2)
 ; CHECK-NEXT:    bne a1, a2, .LBB0_6
-; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a1, %hi(x)
 ; CHECK-NEXT:    addi a1, a1, %lo(x)
-; CHECK-NEXT:    lw a3, 8(a1)
+; CHECK-NEXT:    lw a3, 4(a1)
 ; CHECK-NEXT:    lui a2, %hi(check)
 ; CHECK-NEXT:    addi a2, a2, %lo(check)
+; CHECK-NEXT:    lw a4, 4(a2)
+; CHECK-NEXT:    bne a3, a4, .LBB0_6
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:    lw a3, 8(a1)
 ; CHECK-NEXT:    lw a4, 8(a2)
 ; CHECK-NEXT:    bne a3, a4, .LBB0_6
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    lw a1, 12(a1)
-; CHECK-NEXT:    lw a2, 12(a2)
-; CHECK-NEXT:    bne a1, a2, .LBB0_6
+; CHECK-NEXT:    lw a3, 12(a1)
+; CHECK-NEXT:    lw a4, 12(a2)
+; CHECK-NEXT:    bne a3, a4, .LBB0_6
 ; CHECK-NEXT:  # %bb.4:
-; CHECK-NEXT:    lui a1, %hi(x)
-; CHECK-NEXT:    addi a1, a1, %lo(x)
 ; CHECK-NEXT:    lw a3, 16(a1)
-; CHECK-NEXT:    lui a2, %hi(check)
-; CHECK-NEXT:    addi a2, a2, %lo(check)
 ; CHECK-NEXT:    lw a4, 16(a2)
 ; CHECK-NEXT:    bne a3, a4, .LBB0_6
 ; CHECK-NEXT:  # %bb.5:
diff --git a/llvm/test/CodeGen/RISCV/vlenb.ll b/llvm/test/CodeGen/RISCV/vlenb.ll
index 1d6c1b5d1acbd..26d4f99c3b979 100644
--- a/llvm/test/CodeGen/RISCV/vlenb.ll
+++ b/llvm/test/CodeGen/RISCV/vlenb.ll
@@ -71,10 +71,13 @@ define void @machine_licm() {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -4
+; CHECK-NEXT:    .cfi_offset s0, -8
+; CHECK-NEXT:    csrr s0, vlenb
 ; CHECK-NEXT:  .LBB4_1: # %loop
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    mv a0, s0
 ; CHECK-NEXT:    call use
 ; CHECK-NEXT:    j .LBB4_1
 entry:
diff --git a/llvm/test/CodeGen/SPIRV/event-zero-const.ll b/llvm/test/CodeGen/SPIRV/event-zero-const.ll
new file mode 100644
index 0000000000000..b40456d233f12
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/event-zero-const.ll
@@ -0,0 +1,23 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: %[[#LongTy:]] = OpTypeInt 64 0
+; CHECK: %[[#EventTy:]] = OpTypeEvent
+; CHECK: %[[#LongNull:]] = OpConstantNull %[[#LongTy]]
+; CHECK: %[[#EventNull:]] = OpConstantNull %[[#EventTy]]
+; CHECK: OpFunction
+; CHECK: OpINotEqual %[[#]] %[[#]] %[[#LongNull]]
+; CHECK: OpGroupAsyncCopy %[[#EventTy]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#EventNull]]
+
+
+define weak_odr dso_local spir_kernel void @foo(i64 %_arg_i, ptr addrspace(1) %_arg_ptr, ptr addrspace(3) %_arg_local) { ; kernel that needs a null constant of two different types
+entry:
+  %r1 = icmp ne i64 %_arg_i, 0 ; i64 zero operand: expected to be emitted as an OpConstantNull of the 64-bit integer type
+  %e1 = tail call spir_func target("spirv.Event") @__spirv_GroupAsyncCopy(i32 2, ptr addrspace(3) %_arg_local, ptr addrspace(1) %_arg_ptr, i64 1, i64 1, target("spirv.Event") zeroinitializer) ; zeroinitializer event operand: expected as a distinct OpConstantNull of the OpTypeEvent type
+  ret void
+}
+
+declare dso_local spir_func target("spirv.Event") @__spirv_GroupAsyncCopy(i32, ptr addrspace(3), ptr addrspace(1), i64, i64, target("spirv.Event"))
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll
new file mode 100644
index 0000000000000..359f6d1c0f8e5
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/basic-load-store.ll
@@ -0,0 +1,53 @@
+; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_cache_controls
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV: Capability CacheControlsINTEL
+; CHECK-SPIRV: Extension "SPV_INTEL_cache_controls"
+; CHECK-SPIRV-DAG: OpName %[[#GVar:]] "G"
+; CHECK-SPIRV-DAG: OpName %[[#Arg:]] "buffer"
+; CHECK-SPIRV-DAG: OpDecorate %[[#GVar]] CacheControlStoreINTEL 0 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#GVar]] CacheControlStoreINTEL 1 3
+; CHECK-SPIRV-DAG: OpDecorate %[[#Arg]] CacheControlLoadINTEL 0 0
+; CHECK-SPIRV-DAG: OpDecorate %[[#Arg]] CacheControlStoreINTEL 0 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#LoadPtr:]] CacheControlLoadINTEL 0 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#LoadPtr]] CacheControlLoadINTEL 1 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#StorePtr:]] CacheControlStoreINTEL 0 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#StorePtr]] CacheControlStoreINTEL 1 2
+; CHECK-SPIRV: OpLoad %[[#]] %[[#LoadPtr]]
+; CHECK-SPIRV: OpStore %[[#StorePtr]] %[[#]]
+
+@G = common addrspace(1) global i32 0, align 4, !spirv.Decorations !9
+
+define spir_kernel void @test(ptr addrspace(1) %dummy, ptr addrspace(1) %buffer) !spirv.ParameterDecorations !12 { ; !12 lists an empty node (!13) and then !14, the load/store cache controls expected on %buffer
+entry:
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %buffer, i64 1, !spirv.Decorations !3 ; !3 = load cache controls (levels 0 and 1, both Cached) on the pointer that is loaded from
+  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr addrspace(1) %buffer, i64 0, !spirv.Decorations !6 ; !6 = store cache controls (level 0 WriteThrough, level 1 WriteBack) on the pointer that is stored to
+  store i32 %0, ptr addrspace(1) %arrayidx1, align 4
+  ret void
+}
+
+!spirv.MemoryModel = !{!0}
+!spirv.Source = !{!1}
+!opencl.spir.version = !{!2}
+!opencl.ocl.version = !{!2}
+
+!0 = !{i32 2, i32 2}
+!1 = !{i32 3, i32 102000}
+!2 = !{i32 1, i32 2}
+!3 = !{!4, !5}
+!4 = !{i32 6442, i32 0, i32 1}  ; {CacheControlLoadINTEL, CacheLevel=0, Cached}
+!5 = !{i32 6442, i32 1, i32 1}  ; {CacheControlLoadINTEL, CacheLevel=1, Cached}
+!6 = !{!7, !8}
+!7 = !{i32 6443, i32 0, i32 1}  ; {CacheControlStoreINTEL, CacheLevel=0, WriteThrough}
+!8 = !{i32 6443, i32 1, i32 2}  ; {CacheControlStoreINTEL, CacheLevel=1, WriteBack}
+!9 = !{!10, !11}
+!10 = !{i32 6443, i32 0, i32 1}  ; {CacheControlStoreINTEL, CacheLevel=0, WriteThrough}
+!11 = !{i32 6443, i32 1, i32 3}  ; {CacheControlStoreINTEL, CacheLevel=1, Streaming}
+!12 = !{!13, !14}
+!13 = !{}
+!14 = !{!15, !16}
+!15 = !{i32 6442, i32 0, i32 0}  ; {CacheControlLoadINTEL,   CacheLevel=0, Uncached}
+!16 = !{i32 6443, i32 0, i32 1}  ; {CacheControlStoreINTEL,  CacheLevel=0, WriteThrough}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll
new file mode 100644
index 0000000000000..9a13b720f61f7
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_cache_controls/decorate-prefetch-w-cache-controls.ll
@@ -0,0 +1,44 @@
+; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_cache_controls
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_cache_controls %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV: Capability CacheControlsINTEL
+; CHECK-SPIRV: Extension "SPV_INTEL_cache_controls"
+
+; CHECK-SPIRV-DAG: OpName %[[#Ptr1:]] "ptr1"
+; CHECK-SPIRV-DAG: OpName %[[#Ptr2:]] "ptr2"
+; CHECK-SPIRV-DAG: OpName %[[#Ptr3:]] "ptr3"
+; CHECK-SPIRV-DAG: OpDecorate %[[#Ptr1]] CacheControlLoadINTEL 0 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#Ptr2]] CacheControlLoadINTEL 1 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#Ptr3]] CacheControlStoreINTEL 2 3
+; CHECK-SPIRV: OpExtInst %[[#]] %[[#]] prefetch %[[#Ptr1]] %[[#]]
+; CHECK-SPIRV: OpExtInst %[[#]] %[[#]] prefetch %[[#Ptr2]] %[[#]]
+; CHECK-SPIRV: OpExtInst %[[#]] %[[#]] prefetch %[[#Ptr3]] %[[#]]
+
+; 6442 is the CacheControlLoadINTEL decoration token; 6443 is CacheControlStoreINTEL.
+@.str.1 = private unnamed_addr addrspace(1) constant [16 x i8] c"../prefetch.hpp\00", section "llvm.metadata"
+@.str.9 = private unnamed_addr addrspace(1) constant [13 x i8] c"{6442:\220,1\22}\00", section "llvm.metadata"
+@.str.10 = private unnamed_addr addrspace(1) constant [13 x i8] c"{6442:\221,1\22}\00", section "llvm.metadata"
+@.str.11 = private unnamed_addr addrspace(1) constant [13 x i8] c"{6443:\222,3\22}\00", section "llvm.metadata"
+
+define weak_odr dso_local spir_kernel void @foo(ptr addrspace(1) noundef align 1 %_arg_dataPtr) { ; three annotate-then-prefetch sequences, one per annotation string
+entry:
+  %r0 = addrspacecast ptr addrspace(1) %_arg_dataPtr to ptr addrspace(4)
+  %ptr1 = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef %r0, i32 noundef 5)
+  %r1 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %ptr1, ptr addrspace(1) @.str.9, ptr addrspace(1) @.str.1, i32 76, ptr addrspace(1) null) ; @.str.9 carries token 6442 with "0,1": expected as CacheControlLoadINTEL 0 1 on %ptr1
+  tail call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef %r1, i64 noundef 1) ; the expected output has this prefetch take the decorated %ptr1 value
+  %arrayidx3.i = getelementptr inbounds i8, ptr addrspace(4) %r0, i64 1
+  %ptr2 = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef %arrayidx3.i, i32 noundef 5)
+  %r2 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %ptr2, ptr addrspace(1) @.str.10, ptr addrspace(1) @.str.1, i32 80, ptr addrspace(1) null) ; @.str.10 carries token 6442 with "1,1": expected as CacheControlLoadINTEL 1 1 on %ptr2
+  tail call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef %r2, i64 noundef 1)
+  %arrayidx7.i = getelementptr inbounds i8, ptr addrspace(4) %r0, i64 2
+  %ptr3 = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef %arrayidx7.i, i32 noundef 5)
+  %r3 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %ptr3, ptr addrspace(1) @.str.11, ptr addrspace(1) @.str.1, i32 80, ptr addrspace(1) null) ; @.str.11 carries token 6443 with "2,3": expected as CacheControlStoreINTEL 2 3 on %ptr3
+  tail call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef %r3, i64 noundef 2)
+  ret void
+}
+
+declare ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1), ptr addrspace(1), ptr addrspace(1), i32, ptr addrspace(1))
+declare dso_local spir_func void @_Z20__spirv_ocl_prefetchPU3AS1Kcm(ptr addrspace(1) noundef, i64 noundef)
+declare dso_local spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef, i32 noundef)
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll
new file mode 100644
index 0000000000000..40008873bf19b
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_fpga_decorations/global-var-decorations.ll
@@ -0,0 +1,33 @@
+; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_global_variable_fpga_decorations
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_fpga_decorations %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_fpga_decorations %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV: Capability GlobalVariableFPGADecorationsINTEL
+; CHECK-SPIRV: Extension "SPV_INTEL_global_variable_fpga_decorations"
+; CHECK-SPIRV-DAG: OpName %[[#G1:]] "int_var"
+; CHECK-SPIRV-DAG: OpName %[[#G2:]] "float_var"
+; CHECK-SPIRV-DAG: OpName %[[#G3:]] "bool_var"
+; CHECK-SPIRV-DAG: OpDecorate %[[#G1]] ImplementInRegisterMapINTEL 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#G1]] InitModeINTEL 0
+; CHECK-SPIRV-DAG: OpDecorate %[[#G2]] ImplementInRegisterMapINTEL 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#G2]] InitModeINTEL 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] ImplementInRegisterMapINTEL 0
+; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] InitModeINTEL 0
+
+@int_var = addrspace(1) global i32 42, !spirv.Decorations !1
+@float_var = addrspace(1) global float 1.0, !spirv.Decorations !5
+@bool_var = addrspace(1) global i1 0, !spirv.Decorations !7
+
+define spir_kernel void @test() { ; empty kernel: every assertion in this file is about the decorated globals declared before it
+entry:
+  ret void
+}
+
+!1 = !{!2, !3}
+!2 = !{i32 6191, i1 true} ; ImplementInRegisterMapINTEL = true
+!3 = !{i32 6190, i32 0} ; InitModeINTEL = 0
+!4 = !{i32 6190, i32 1} ; InitModeINTEL = 1
+!5 = !{!2, !4}
+!6 = !{i32 6191, i1 false} ; ImplementInRegisterMapINTEL = false
+!7 = !{!6, !3}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll
new file mode 100644
index 0000000000000..1397435efb2d4
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_global_variable_host_access/global-var-host-access.ll
@@ -0,0 +1,34 @@
+; Adapted from https://github.com/KhronosGroup/SPIRV-LLVM-Translator/tree/main/test/extensions/INTEL/SPV_INTEL_global_variable_host_access
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_host_access,+SPV_INTEL_global_variable_fpga_decorations %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_global_variable_host_access,+SPV_INTEL_global_variable_fpga_decorations %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: Capability GlobalVariableHostAccessINTEL
+; CHECK-SPIRV-DAG: Capability GlobalVariableFPGADecorationsINTEL
+; CHECK-SPIRV-DAG: Extension "SPV_INTEL_global_variable_host_access"
+; CHECK-SPIRV-DAG: Extension "SPV_INTEL_global_variable_fpga_decorations"
+
+; CHECK-SPIRV-DAG: OpName %[[#G1:]] "int_var"
+; CHECK-SPIRV-DAG: OpName %[[#G2:]] "bool_var"
+; CHECK-SPIRV-DAG: OpName %[[#G3:]] "float_var"
+; CHECK-SPIRV-DAG: OpDecorate %[[#G1]] HostAccessINTEL 1 "IntVarName"
+; CHECK-SPIRV-DAG: OpDecorate %[[#G2]] HostAccessINTEL 3 "BoolVarName"
+; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] ImplementInRegisterMapINTEL 1
+; CHECK-SPIRV-DAG: OpDecorate %[[#G3]] InitModeINTEL 1
+
+@int_var = addrspace(1) global i32 42, !spirv.Decorations !1
+@bool_var = addrspace(1) global i1 0, !spirv.Decorations !4
+@float_var = addrspace(1) global float 1.0, !spirv.Decorations !5
+
+define spir_kernel void @test() { ; empty kernel: every assertion in this file is about the decorated globals declared before it
+entry:
+  ret void
+}
+
+!1 = !{!2}
+!2 = !{i32 6188, i32 1, !"IntVarName"} ; HostAccessINTEL 1 "IntVarName"
+!3 = !{i32 6188, i32 3, !"BoolVarName"} ; HostAccessINTEL 3 "BoolVarName"
+!4 = !{!3}
+!5 = !{!6, !7}
+!6 = !{i32 6191, i1 true} ; ImplementInRegisterMapINTEL = true
+!7 = !{i32 6190, i32 1} ; InitModeINTEL = 1
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll
new file mode 100644
index 0000000000000..06f1d0bf7fd37
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ptr-annotation.ll
@@ -0,0 +1,41 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpName %[[#Foo:]] "foo"
+; CHECK-DAG: OpName %[[#Ptr1:]] "_arg1"
+; CHECK-DAG: OpName %[[#Ptr2:]] "_arg2"
+; CHECK-DAG: OpName %[[#Ptr3:]] "_arg3"
+; CHECK-DAG: OpName %[[#Ptr4:]] "_arg4"
+; CHECK-DAG: OpName %[[#Ptr5:]] "_arg5"
+; CHECK-DAG: OpDecorate %[[#Ptr1]] NonReadable
+; CHECK-DAG: OpDecorate %[[#Ptr2]] Alignment 128
+; CHECK-DAG: OpDecorate %[[#Ptr2]] NonReadable
+; CHECK-DAG: OpDecorate %[[#Ptr3]] Alignment 128
+; CHECK-DAG: OpDecorate %[[#Ptr3]] NonReadable
+; CHECK-DAG: OpDecorate %[[#Ptr4]] Alignment 128
+; CHECK-DAG: OpDecorate %[[#Ptr4]] NonReadable
+; CHECK-DAG: OpDecorate %[[#Ptr5]] UserSemantic "Unknown format"
+; CHECK: %[[#Foo]] = OpFunction
+; CHECK-NEXT: %[[#Ptr1]] = OpFunctionParameter
+; CHECK-NEXT: %[[#Ptr2]] = OpFunctionParameter
+; CHECK-NEXT: %[[#Ptr3]] = OpFunctionParameter
+; CHECK-NEXT: %[[#Ptr4]] = OpFunctionParameter
+; CHECK-NEXT: %[[#Ptr5]] = OpFunctionParameter
+; CHECK: OpFunctionEnd
+
+@.str.0 = private unnamed_addr addrspace(1) constant [16 x i8] c"../prefetch.hpp\00", section "llvm.metadata"
+@.str.1 = private unnamed_addr addrspace(1) constant [5 x i8] c"{25}\00", section "llvm.metadata"
+@.str.2 = private unnamed_addr addrspace(1) constant [13 x i8] c"{44:128}{25}\00", section "llvm.metadata"
+@.str.3 = private unnamed_addr addrspace(1) constant [15 x i8] c"{44:\22128\22}{25}\00", section "llvm.metadata"
+@.str.4 = private unnamed_addr addrspace(1) constant [13 x i8] c"{44,128}{25}\00", section "llvm.metadata"
+@.str.5 = private unnamed_addr addrspace(1) constant [15 x i8] c"Unknown format\00", section "llvm.metadata"
+
+define spir_kernel void @foo(ptr addrspace(1) %_arg1, ptr addrspace(1) %_arg2, ptr addrspace(1) %_arg3, ptr addrspace(1) %_arg4, ptr addrspace(1) %_arg5) { ; one parameter per annotation-string format under test
+entry:
+  %r1 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg1, ptr addrspace(1) @.str.1, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null) ; "{25}": expected as NonReadable on %_arg1
+  %r2 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg2, ptr addrspace(1) @.str.2, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null) ; "{44:128}{25}": expected as Alignment 128 plus NonReadable on %_arg2
+  %r3 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg3, ptr addrspace(1) @.str.3, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null) ; same pair with the value quoted ({44:"128"}{25}): same decorations expected on %_arg3
+  %r4 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg4, ptr addrspace(1) @.str.4, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null) ; same pair with a comma separator ({44,128}{25}): same decorations expected on %_arg4
+  %r5 = tail call ptr addrspace(1) @llvm.ptr.annotation.p1.p1(ptr addrspace(1) %_arg5, ptr addrspace(1) @.str.5, ptr addrspace(1) @.str.0, i32 80, ptr addrspace(1) null) ; free-form text: expected to fall back to UserSemantic "Unknown format" on %_arg5
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll b/llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll
new file mode 100644
index 0000000000000..471ab03ed89f6
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/phi-spvintrinsic-dominate.ll
@@ -0,0 +1,39 @@
+; This test checks that the internal (spv) intrinsic calls newly inserted for
+; PHI operands are placed at correct positions, so that they neither violate
+; instruction dominance nor break the required grouping of PHI nodes at the
+; top of a basic block.
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpFunction
+; CHECK: OpBranch
+; CHECK: OpLabel
+; CHECK: OpPhi
+; CHECK: OpPhi
+; CHECK: OpPhi
+
+define spir_kernel void @foo(ptr addrspace(1) %_arg1) { ; loop whose header starts with three PHI nodes, two of them with constant vector operands
+entry:
+  br label %l1
+
+l1: ; loop header: the three PHIs below are the three consecutive OpPhi expected in the output
+  %sw = phi <4 x double> [ %vec, %l2 ], [ <double 0.0, double 0.0, double 0.0, double poison>, %entry ] ; constant vector with a poison lane incoming from %entry
+  %in = phi <3 x double> [ %ins, %l2 ], [ zeroinitializer, %entry ] ; zeroinitializer incoming from %entry
+  %r1 = phi i32 [ %r2, %l2 ], [ 0, %entry ] ; loop counter, starts at 0
+  %c1 = icmp ult i32 %r1, 3
+  br i1 %c1, label %l2, label %exit
+
+l2:
+  %r3 = zext nneg i32 %r1 to i64
+  %r4 = getelementptr inbounds double, ptr addrspace(1) %_arg1, i64 %r3
+  %r5 = load double, ptr addrspace(1) %r4, align 8
+  %ins = insertelement <3 x double> %in, double %r5, i32 %r1
+  %exp = shufflevector <3 x double> %ins, <3 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+  %vec = shufflevector <4 x double> %exp, <4 x double> %sw, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ; lanes 0-2 from %exp, lane 3 carried over from %sw
+  %r2 = add nuw nsw i32 %r1, 1
+  br label %l1
+
+exit:
+  ret void
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll
new file mode 100644
index 0000000000000..f9b3757bb6d2c
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll
@@ -0,0 +1,197 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
+
+; This loop has a vpt block that should not block tailpredication
+define void @convert_vptblock(ptr %pchTarget, i16 signext %iTargetStride, ptr %pwLineMask, ptr %ptCopySize, i8 zeroext %chColour, i8 zeroext %chOpacity) {
+; CHECK-LABEL: convert_vptblock:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldrsh.w r12, [r3, #2]
+; CHECK-NEXT:    cmp.w r12, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB0_1: @ %for.body.lr.ph
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    ldrsh.w r10, [r3]
+; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    ldrd r4, r5, [sp, #88]
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    cmp.w r10, #8
+; CHECK-NEXT:    mov.w r0, #1
+; CHECK-NEXT:    mov r3, r10
+; CHECK-NEXT:    mov.w r11, #0
+; CHECK-NEXT:    it ge
+; CHECK-NEXT:    movge r3, #8
+; CHECK-NEXT:    vidup.u16 q0, r8, #4
+; CHECK-NEXT:    sub.w r3, r10, r3
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    adds r3, #7
+; CHECK-NEXT:    vmov.i16 q2, #0x100
+; CHECK-NEXT:    vmov.i16 q3, #0xff
+; CHECK-NEXT:    add.w r9, r0, r3, lsr #3
+; CHECK-NEXT:  .LBB0_2: @ %for.body
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
+; CHECK-NEXT:    mov r3, r10
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    mov r6, r8
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    dls lr, r9
+; CHECK-NEXT:  .LBB0_3: @ %do.body
+; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vctp.16 r3
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u16 q5, [r2, q4]
+; CHECK-NEXT:    vmul.i16 q4, q5, r5
+; CHECK-NEXT:    vshr.u16 q4, q4, #8
+; CHECK-NEXT:    vsub.i16 q5, q2, q4
+; CHECK-NEXT:    vpt.i16 eq, q4, q3
+; CHECK-NEXT:    vmovt q5, q1
+; CHECK-NEXT:    vctp.16 r3
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrbt.u16 q6, [r0]
+; CHECK-NEXT:    vsub.i16 q4, q2, q5
+; CHECK-NEXT:    subs r3, #8
+; CHECK-NEXT:    vmul.i16 q5, q5, q6
+; CHECK-NEXT:    vmla.i16 q5, q4, r4
+; CHECK-NEXT:    vshr.u16 q4, q5, #8
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrbt.16 q4, [r0], #8
+; CHECK-NEXT:    vidup.u16 q4, r6, #4
+; CHECK-NEXT:    le lr, .LBB0_3
+; CHECK-NEXT:  @ %bb.4: @ %do.end
+; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    add.w r0, r11, #1
+; CHECK-NEXT:    add r7, r1
+; CHECK-NEXT:    sxth.w r11, r0
+; CHECK-NEXT:    cmp r11, r12
+; CHECK-NEXT:    blt .LBB0_2
+; CHECK-NEXT:  @ %bb.5:
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    bx lr
+entry:
+  %iHeight1 = getelementptr inbounds i8, ptr %ptCopySize, i32 2
+  %0 = load i16, ptr %iHeight1, align 2
+  %cmp28 = icmp sgt i16 %0, 0
+  br i1 %cmp28, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %1 = load i16, ptr %ptCopySize, align 2
+  %conv5 = sext i16 %1 to i32
+  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 0, i32 4)
+  %conv6 = zext i8 %chOpacity to i16
+  %.splatinsert = insertelement <8 x i16> poison, i16 %conv6, i64 0
+  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer
+  %conv7 = zext i8 %chColour to i16
+  %.splatinsert.i = insertelement <8 x i16> poison, i16 %conv7, i64 0
+  %.splat.i = shufflevector <8 x i16> %.splatinsert.i, <8 x i16> poison, <8 x i32> zeroinitializer
+  %conv11 = sext i16 %iTargetStride to i32
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %do.end, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %do.end
+  %pchTarget.addr.030 = phi ptr [ %pchTarget, %for.body.lr.ph ], [ %add.ptr12, %do.end ]
+  %y.029 = phi i16 [ 0, %for.body.lr.ph ], [ %inc, %do.end ]
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %for.body
+  %blkCnt.0 = phi i32 [ %conv5, %for.body ], [ %sub8, %do.body ]
+  %.pn = phi { <8 x i16>, i32 } [ %2, %for.body ], [ %13, %do.body ]
+  %pchTargetLine.0 = phi ptr [ %pchTarget.addr.030, %for.body ], [ %add.ptr, %do.body ]
+  %vStride4Offs.0 = extractvalue { <8 x i16>, i32 } %.pn, 0
+  %incr.0 = extractvalue { <8 x i16>, i32 } %.pn, 1
+  %3 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.0) ; lane predicate from the remaining count; it masks the gather load, the masked load and the masked store below
+  %4 = tail call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr %pwLineMask, <8 x i16> %vStride4Offs.0, i32 8, i32 0, i32 1, <8 x i1> %3)
+  %5 = mul <8 x i16> %4, %.splat
+  %shr = lshr <8 x i16> %5, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %6 = icmp eq <8 x i16> %shr, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %7 = sub nuw nsw <8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, %shr
+  %sub = select <8 x i1> %6, <8 x i16> zeroinitializer, <8 x i16> %7 ; zero where %shr == 255 (%6), otherwise 256 - %shr (%7)
+  %8 = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %pchTargetLine.0, i32 1, <8 x i1> %3, <8 x i8> zeroinitializer)
+  %9 = zext <8 x i8> %8 to <8 x i16>
+  %sub.i = sub nsw <8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, %sub
+  %10 = mul <8 x i16> %sub.i, %.splat.i
+  %11 = mul <8 x i16> %sub, %9
+  %add.i = add <8 x i16> %10, %11
+  %shr.i = lshr <8 x i16> %add.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %12 = trunc nuw <8 x i16> %shr.i to <8 x i8>
+  tail call void @llvm.masked.store.v8i8.p0(<8 x i8> %12, ptr %pchTargetLine.0, i32 1, <8 x i1> %3)
+  %13 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %incr.0, i32 4)
+  %add.ptr = getelementptr inbounds i8, ptr %pchTargetLine.0, i32 8
+  %sub8 = add nsw i32 %blkCnt.0, -8
+  %cmp9 = icmp sgt i32 %blkCnt.0, 8 ; keep looping while more than 8 elements were left before this iteration
+  br i1 %cmp9, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  %add.ptr12 = getelementptr inbounds i8, ptr %pchTarget.addr.030, i32 %conv11
+  %inc = add nuw nsw i16 %y.029, 1
+  %cmp = icmp slt i16 %inc, %0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; This loop has an else predicate on the vqshl, which is not very realistic but
+; prevents us from converting to a vptblock without being able to remove it.
+define i32 @else(ptr %s1, ptr %s2, i32 %x, ptr %d, i32 %n) {
+; CHECK-LABEL: else:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    ldr r2, [sp, #8]
+; CHECK-NEXT:    cmp r2, #4
+; CHECK-NEXT:    mov r3, r2
+; CHECK-NEXT:    it ge
+; CHECK-NEXT:    movge r3, #4
+; CHECK-NEXT:    subs r3, r2, r3
+; CHECK-NEXT:    add.w r12, r3, #3
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    add.w r12, r3, r12, lsr #2
+; CHECK-NEXT:    movs r3, #98
+; CHECK-NEXT:    dls lr, r12
+; CHECK-NEXT:  .LBB1_1: @ %do.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.32 r2
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vpstet
+; CHECK-NEXT:    vqdmlsdht.s32 q2, q1, q0
+; CHECK-NEXT:    vqshle.u32 q2, r3
+; CHECK-NEXT:    vstrwt.32 q2, [r0], #16
+; CHECK-NEXT:    le lr, .LBB1_1
+; CHECK-NEXT:  @ %bb.2: @ %do.end
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %sub, %do.body ]
+  %s2.addr.0 = phi ptr [ %s2, %entry ], [ %add.ptr1, %do.body ]
+  %s1.addr.0 = phi ptr [ %s1, %entry ], [ %add.ptr, %do.body ]
+  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.0) ; lane predicate for the remaining element count; masks both loads, the vqdmlad and the store
+  %1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s1.addr.0, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %s2.addr.0, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+  %3 = tail call <4 x i32> @llvm.arm.mve.vqdmlad.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %2, <4 x i32> %1, i32 0, i32 0, i32 1, <4 x i1> %0)
+  %4 = xor <4 x i1> %0, <i1 true, i1 true, i1 true, i1 true> ; inverted vctp mask; predicating the shift on it is the "else" case described in the comment before this function
+  %5 = tail call <4 x i32> @llvm.arm.mve.vshl.scalar.predicated.v4i32.v4i1(<4 x i32> %3, i32 98, i32 1, i32 0, i32 1, <4 x i1> %4)
+  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %5, ptr %s1.addr.0, i32 4, <4 x i1> %0)
+  %add.ptr = getelementptr inbounds i8, ptr %s1.addr.0, i32 16
+  %add.ptr1 = getelementptr inbounds i8, ptr %s2.addr.0, i32 16
+  %sub = add nsw i32 %n.addr.0, -4
+  %cmp = icmp sgt i32 %n.addr.0, 4 ; keep looping while more than 4 elements were left before this iteration
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  ret i32 0
+}
diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll
similarity index 100%
rename from llvm/test/CodeGen/WebAssembly/cfg-stackify-eh.ll
rename to llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll
diff --git a/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll
new file mode 100644
index 0000000000000..52a6364e12258
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/eh-option-errors.ll
@@ -0,0 +1,22 @@
+target triple = "wasm32-unknown-unknown"
+
+; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-eh 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_EH
+; EM_EH_W_WASM_EH: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-eh
+
+; RUN: not --crash llc < %s -enable-emscripten-sjlj -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_SJLJ_W_WASM_SJLJ
+; EM_SJLJ_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-sjlj not allowed with -wasm-enable-sjlj
+
+; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -wasm-enable-sjlj 2>&1 | FileCheck %s --check-prefix=EM_EH_W_WASM_SJLJ
+; EM_EH_W_WASM_SJLJ: LLVM ERROR: -enable-emscripten-cxx-exceptions not allowed with -wasm-enable-sjlj
+
+; RUN: not --crash llc < %s -wasm-enable-exnref 2>&1 | FileCheck %s --check-prefix=WASM_EXNREF_ONLY
+; WASM_EXNREF_ONLY: LLVM ERROR: -wasm-enable-exnref should be used with -wasm-enable-eh
+
+; RUN: not --crash llc < %s -wasm-enable-eh -exception-model=dwarf 2>&1 | FileCheck %s --check-prefix=EH_MODEL_DWARF
+; EH_MODEL_DWARF: LLVM ERROR: -exception-model should be either 'none' or 'wasm'
+
+; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -exception-model=wasm 2>&1 | FileCheck %s --check-prefix=EM_EH_W_MODEL_WASM
+; EM_EH_W_MODEL_WASM: LLVM ERROR: -exception-model=wasm not allowed with -enable-emscripten-cxx-exceptions
+
+; RUN: not --crash llc < %s -exception-model=wasm 2>&1 | FileCheck %s --check-prefix=MODEL_WASM_WO_WASM_EH_SJLJ
+; MODEL_WASM_WO_WASM_EH_SJLJ: LLVM ERROR: -exception-model=wasm only allowed with at least one of -wasm-enable-eh or -wasm-enable-sjlj
diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll
similarity index 100%
rename from llvm/test/CodeGen/WebAssembly/exception.ll
rename to llvm/test/CodeGen/WebAssembly/exception-legacy.ll
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index d9d3f6be800fd..73ccea8d652db 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -35,3 +35,71 @@ define float @extract_lane_v8f16(<8 x half> %v) {
   %r = call float @llvm.wasm.extract.lane.f16x8(<8 x half> %v, i32 1)
   ret float %r
 }
+
+; CHECK-LABEL: add_v8f16:
+; CHECK:       f16x8.add $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+define <8 x half> @add_v8f16(<8 x half> %a, <8 x half> %b) {
+  %r = fadd <8 x half> %a, %b
+  ret <8 x half> %r
+}
+
+; CHECK-LABEL: sub_v8f16:
+; CHECK:       f16x8.sub $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+define <8 x half> @sub_v8f16(<8 x half> %a, <8 x half> %b) {
+  %r = fsub <8 x half> %a, %b
+  ret <8 x half> %r
+}
+
+; CHECK-LABEL: mul_v8f16:
+; CHECK:       f16x8.mul $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+define <8 x half> @mul_v8f16(<8 x half> %a, <8 x half> %b) {
+  %r = fmul <8 x half> %a, %b
+  ret <8 x half> %r
+}
+
+; CHECK-LABEL: div_v8f16:
+; CHECK:       f16x8.div $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+define <8 x half> @div_v8f16(<8 x half> %a, <8 x half> %b) {
+  %r = fdiv <8 x half> %a, %b
+  ret <8 x half> %r
+}
+
+; CHECK-LABEL: min_intrinsic_v8f16:
+; CHECK:       f16x8.min $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
+define <8 x half> @min_intrinsic_v8f16(<8 x half> %x, <8 x half> %y) {
+  %a = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y)
+  ret <8 x half> %a
+}
+
+; CHECK-LABEL: max_intrinsic_v8f16:
+; CHECK:       f16x8.max $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
+define <8 x half> @max_intrinsic_v8f16(<8 x half> %x, <8 x half> %y) {
+  %a = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y)
+  ret <8 x half> %a
+}
+
+; CHECK-LABEL: pmin_intrinsic_v8f16:
+; CHECK:       f16x8.pmin $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+declare <8 x half> @llvm.wasm.pmin.v8f16(<8 x half>, <8 x half>)
+define <8 x half> @pmin_intrinsic_v8f16(<8 x half> %a, <8 x half> %b) {
+  %v = call <8 x half> @llvm.wasm.pmin.v8f16(<8 x half> %a, <8 x half> %b)
+  ret <8 x half> %v
+}
+
+; CHECK-LABEL: pmax_intrinsic_v8f16:
+; CHECK:       f16x8.pmax $push0=, $0, $1
+; CHECK-NEXT:  return $pop0
+declare <8 x half> @llvm.wasm.pmax.v8f16(<8 x half>, <8 x half>)
+define <8 x half> @pmax_intrinsic_v8f16(<8 x half> %a, <8 x half> %b) {
+  %v = call <8 x half> @llvm.wasm.pmax.v8f16(<8 x half> %a, <8 x half> %b)
+  ret <8 x half> %v
+}
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll
index 4a63c812d6ae9..66872a5422986 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll
@@ -1,7 +1,6 @@
 ; RUN: llc < %s -enable-emscripten-cxx-exceptions | FileCheck %s --check-prefix=EH
 ; RUN: llc < %s -enable-emscripten-sjlj | FileCheck %s --check-prefix=SJLJ
 ; RUN: llc < %s | FileCheck %s --check-prefix=NONE
-; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -exception-model=wasm 2>&1 | FileCheck %s --check-prefix=WASM-EH-EM-EH
 
 target triple = "wasm32-unknown-unknown"
 
@@ -97,5 +96,3 @@ declare void @free(ptr)
 attributes #0 = { returns_twice }
 attributes #1 = { noreturn }
 attributes #2 = { nounwind }
-
-; WASM-EH-EM-EH: LLVM ERROR: -exception-model=wasm not allowed with -enable-emscripten-cxx-exceptions
diff --git a/llvm/test/CodeGen/WebAssembly/reg-argument.mir b/llvm/test/CodeGen/WebAssembly/reg-argument.mir
index 23e66dfc71fa1..a549990bdb0a2 100644
--- a/llvm/test/CodeGen/WebAssembly/reg-argument.mir
+++ b/llvm/test/CodeGen/WebAssembly/reg-argument.mir
@@ -68,3 +68,14 @@ body: |
     %1:externref = ARGUMENT_externref 0, implicit $arguments
     RETURN implicit-def $arguments
 ...
+---
+name: argument_exnref
+# CHECK-LABEL: argument_exnref
+body: |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %1:exnref = ARGUMENT_exnref 0
+  bb.0:
+    %0:i32 = CONST_I32 0, implicit-def $arguments
+    %1:exnref = ARGUMENT_exnref 0, implicit $arguments
+    RETURN implicit-def $arguments
+...
diff --git a/llvm/test/CodeGen/WebAssembly/reg-copy.mir b/llvm/test/CodeGen/WebAssembly/reg-copy.mir
index 31a5bfa63a4ea..763fe42d07b61 100644
--- a/llvm/test/CodeGen/WebAssembly/reg-copy.mir
+++ b/llvm/test/CodeGen/WebAssembly/reg-copy.mir
@@ -77,3 +77,14 @@ body: |
     %0:externref = COPY %1:externref
     RETURN implicit-def $arguments
 ...
+---
+name: copy_exnref
+# CHECK-LABEL: copy_exnref
+body: |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:exnref = COPY_EXNREF %1:exnref
+  ; CHECK-NEXT: RETURN
+  bb.0:
+    %0:exnref = COPY %1:exnref
+    RETURN implicit-def $arguments
+...
diff --git a/llvm/test/CodeGen/X86/apx/add.ll b/llvm/test/CodeGen/X86/apx/add.ll
index d3301ecdb72d0..7779ae599f200 100644
--- a/llvm/test/CodeGen/X86/apx/add.ll
+++ b/llvm/test/CodeGen/X86/apx/add.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @add8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: add8rr:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x00,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i8 %a, %b
     ret i8 %add
@@ -17,6 +23,12 @@ define i16 @add16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xf7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i16 %a, %b
     ret i16 %add
@@ -27,6 +39,11 @@ define i32 @add32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i32 %a, %b
     ret i32 %add
@@ -37,6 +54,11 @@ define i64 @add64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i64 %a, %b
     ret i64 %add
@@ -47,6 +69,11 @@ define i8 @add8rm(i8 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x02,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i8, ptr %ptr
     %add = add i8 %a, %b
@@ -58,6 +85,11 @@ define i16 @add16rm(i16 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x03,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i16, ptr %ptr
     %add = add i16 %a, %b
@@ -69,6 +101,11 @@ define i32 @add32rm(i32 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x03,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i32, ptr %ptr
     %add = add i32 %a, %b
@@ -80,6 +117,11 @@ define i64 @add64rm(i64 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x03,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i64, ptr %ptr
     %add = add i64 %a, %b
@@ -92,6 +134,12 @@ define i16 @add16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xc7,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i16 %a, 123
     ret i16 %add
@@ -102,6 +150,11 @@ define i32 @add32ri8(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xc7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i32 %a, 123
     ret i32 %add
@@ -112,6 +165,11 @@ define i64 @add64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq $123, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xc7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i64 %a, 123
     ret i64 %add
@@ -122,6 +180,11 @@ define i8 @add8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xc7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i8 %a, 123
     ret i8 %add
@@ -134,6 +197,13 @@ define i16 @add16ri(i16 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i16 %a, 1234
     ret i16 %add
@@ -145,6 +215,12 @@ define i32 @add32ri(i32 noundef %a) {
 ; CHECK-NEXT:    addl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i32 %a, 123456
     ret i32 %add
@@ -156,6 +232,12 @@ define i64 @add64ri(i64 noundef %a) {
 ; CHECK-NEXT:    addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq $123456, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = add i64 %a, 123456
     ret i64 %add
@@ -166,6 +248,11 @@ define i8 @add8mr(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x00,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %add = add nsw i8 %t, %b
@@ -177,6 +264,11 @@ define i16 @add16mr(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %add = add nsw i16 %t, %b
@@ -188,6 +280,11 @@ define i32 @add32mr(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %add = add nsw i32 %t, %b
@@ -199,6 +296,11 @@ define i64 @add64mr(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %add = add nsw i64 %t, %b
@@ -212,6 +314,13 @@ define i16 @add16mi8(ptr %a) {
 ; CHECK-NEXT:    addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %add = add nsw i16 %t, 123
@@ -223,6 +332,11 @@ define i32 @add32mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x07,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x07,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %add = add nsw i32 %t, 123
@@ -234,6 +348,11 @@ define i64 @add64mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x07,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x07,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %add = add nsw i64 %t, 123
@@ -245,6 +364,11 @@ define i8 @add8mi(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x07,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x07,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %add = add nsw i8 %t, 123
@@ -259,6 +383,14 @@ define i16 @add16mi(ptr %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    addl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %add = add nsw i16 %t, 1234
@@ -271,6 +403,12 @@ define i32 @add32mi(ptr %a) {
 ; CHECK-NEXT:    addl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x07,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x07,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %add = add nsw i32 %t, 123456
@@ -283,6 +421,12 @@ define i64 @add64mi(ptr %a) {
 ; CHECK-NEXT:    addq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x07,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x07,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %add = add nsw i64 %t, 123456
@@ -303,6 +447,15 @@ define i8 @addflag8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xf7]
+; NF-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; NF-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i8 @llvm.uadd.sat.i8(i8 %a, i8 %b)
     ret i8 %add
@@ -317,6 +470,15 @@ define i16 @addflag16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw %si, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xf7]
+; NF-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; NF-NEXT:    # imm = 0xFFFF
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 %b)
     ret i16 %add
@@ -329,6 +491,13 @@ define i32 @addflag32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7]
+; NF-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
     ret i32 %add
@@ -341,6 +510,13 @@ define i64 @addflag64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xf7]
+; NF-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 %b)
     ret i64 %add
@@ -355,6 +531,15 @@ define i8 @addflag8rm(i8 noundef %a, ptr %b) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
+; NF-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; NF-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i8, ptr %b
     %add = call i8 @llvm.uadd.sat.i8(i8 %a, i8 %t)
@@ -370,6 +555,15 @@ define i16 @addflag16rm(i16 noundef %a, ptr %b) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
+; NF-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; NF-NEXT:    # imm = 0xFFFF
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i16, ptr %b
     %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 %t)
@@ -383,6 +577,13 @@ define i32 @addflag32rm(i32 noundef %a, ptr %b) {
 ; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
+; NF-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %t)
@@ -396,6 +597,13 @@ define i64 @addflag64rm(i64 noundef %a, ptr %b) {
 ; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
+; NF-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 %t)
@@ -411,6 +619,15 @@ define i16 @addflag16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw $123, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0xc7,0x7b]
+; NF-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; NF-NEXT:    # imm = 0xFFFF
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 123)
     ret i16 %add
@@ -423,6 +640,13 @@ define i32 @addflag32ri8(i32 noundef %a) {
 ; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b]
+; NF-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 123)
     ret i32 %add
@@ -435,6 +659,13 @@ define i64 @addflag64ri8(i64 noundef %a) {
 ; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b]
+; NF-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123)
     ret i64 %add
@@ -449,6 +680,15 @@ define i8 @addflag8ri(i8 noundef %a) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x7b]
+; NF-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; NF-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i8 @llvm.uadd.sat.i8(i8 %a, i8 123)
     ret i8 %add
@@ -464,6 +704,16 @@ define i16 @addflag16ri(i16 noundef %a) {
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw $1234, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xc7,0xd2,0x04]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; NF-NEXT:    # imm = 0xFFFF
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i16 @llvm.uadd.sat.i16(i16 %a, i16 1234)
     ret i16 %add
@@ -477,6 +727,14 @@ define i32 @addflag32ri(i32 noundef %a) {
 ; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 123456)
     ret i32 %add
@@ -490,6 +748,14 @@ define i64 @addflag64ri(i64 noundef %a) {
 ; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
 ; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: addflag64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; NF-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123456)
     ret i64 %add
@@ -507,6 +773,16 @@ define i1 @add64ri_reloc(i16 %k) {
 ; CHECK-NEXT:    # fixup A - offset: 2, value: val, kind: reloc_signed_4byte
 ; CHECK-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64ri_reloc:
+; NF:       # %bb.0:
+; NF-NEXT:    # kill: def $edi killed $edi def $rdi
+; NF-NEXT:    movswq %di, %rax # encoding: [0x48,0x0f,0xbf,0xc7]
+; NF-NEXT:    addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
+; NF-NEXT:    addq $val, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: val, kind: reloc_signed_4byte
+; NF-NEXT:    setne %al # encoding: [0x0f,0x95,0xc0]
+; NF-NEXT:    retq # encoding: [0xc3]
   %g = getelementptr inbounds i16, ptr @val, i16 %k
   %cmp = icmp ne ptr %g, null
   ret i1 %cmp
@@ -517,6 +793,11 @@ define void @add8mr_legacy(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb %sil, (%rdi) # encoding: [0x40,0x00,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb %sil, (%rdi) # encoding: [0x40,0x00,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %add = add i8 %t, %b
@@ -529,6 +810,11 @@ define void @add16mr_legacy(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addw %si, (%rdi) # encoding: [0x66,0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw %si, (%rdi) # encoding: [0x66,0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %add = add i16 %t, %b
@@ -541,6 +827,11 @@ define void @add32mr_legacy(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl %esi, (%rdi) # encoding: [0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl %esi, (%rdi) # encoding: [0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %add = add i32 %t, %b
@@ -553,6 +844,11 @@ define void @add64mr_legacy(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq %rsi, (%rdi) # encoding: [0x48,0x01,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq %rsi, (%rdi) # encoding: [0x48,0x01,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %add = add i64 %t, %b
@@ -565,6 +861,11 @@ define void @add8mi_legacy(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $123, (%rdi) # encoding: [0x80,0x07,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb $123, (%rdi) # encoding: [0x80,0x07,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %add = add nsw i8 %t, 123
@@ -578,6 +879,12 @@ define void @add16mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addw $1234, (%rdi) # encoding: [0x66,0x81,0x07,0xd2,0x04]
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw $1234, (%rdi) # encoding: [0x66,0x81,0x07,0xd2,0x04]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %add = add nsw i16 %t, 1234
@@ -591,6 +898,12 @@ define void @add32mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addl $123456, (%rdi) # encoding: [0x81,0x07,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl $123456, (%rdi) # encoding: [0x81,0x07,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %add = add nsw i32 %t, 123456
@@ -604,6 +917,12 @@ define void @add64mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addq $123456, (%rdi) # encoding: [0x48,0x81,0x07,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: add64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq $123456, (%rdi) # encoding: [0x48,0x81,0x07,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %add = add nsw i64 %t, 123456
diff --git a/llvm/test/CodeGen/X86/apx/and.ll b/llvm/test/CodeGen/X86/apx/and.ll
index af8f4119ac054..58f54fbe50a52 100644
--- a/llvm/test/CodeGen/X86/apx/and.ll
+++ b/llvm/test/CodeGen/X86/apx/and.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @and8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: and8rr:
@@ -7,6 +8,12 @@ define i8 @and8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-NEXT:    andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i8 %a, %b
     ret i8 %and
@@ -18,6 +25,12 @@ define i16 @and16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i16 %a, %b
     ret i16 %and
@@ -28,6 +41,11 @@ define i32 @and32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i32 %a, %b
     ret i32 %and
@@ -38,6 +56,11 @@ define i64 @and64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x21,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x21,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i64 %a, %b
     ret i64 %and
@@ -48,6 +71,11 @@ define i8 @and8rm(i8 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x22,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x22,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i8, ptr %b
     %and = and i8 %a, %t
@@ -59,6 +87,11 @@ define i16 @and16rm(i16 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x23,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x23,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i16, ptr %b
     %and = and i16 %a, %t
@@ -70,6 +103,11 @@ define i32 @and32rm(i32 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x23,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x23,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
     %and = and i32 %a, %t
@@ -81,6 +119,11 @@ define i64 @and64rm(i64 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x23,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x23,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
     %and = and i64 %a, %t
@@ -93,6 +136,12 @@ define i16 @and16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    andl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xe7,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xe7,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i16 %a, 123
     ret i16 %and
@@ -103,6 +152,11 @@ define i32 @and32ri8(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xe7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xe7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i32 %a, 123
     ret i32 %and
@@ -113,6 +167,11 @@ define i64 @and64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xe7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xe7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i64 %a, 123
     ret i64 %and
@@ -123,6 +182,11 @@ define i8 @and8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xe7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xe7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i8 %a, 123
     ret i8 %and
@@ -135,6 +199,13 @@ define i16 @and16ri(i16 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xe7,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i16 %a, 1234
     ret i16 %and
@@ -146,6 +217,12 @@ define i32 @and32ri(i32 noundef %a) {
 ; CHECK-NEXT:    andl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i32 %a, 123456
     ret i32 %and
@@ -157,6 +234,12 @@ define i64 @and64ri(i64 noundef %a) {
 ; CHECK-NEXT:    andl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i64 %a, 123456
     ret i64 %and
@@ -167,6 +250,11 @@ define i8 @and8mr(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x20,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x20,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %and = and i8 %t, %b
@@ -178,6 +266,11 @@ define i16 @and16mr(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %and = and i16 %t, %b
@@ -189,6 +282,11 @@ define i32 @and32mr(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %and = and i32 %t, %b
@@ -200,6 +298,11 @@ define i64 @and64mr(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %and = and i64 %t, %b
@@ -213,6 +316,13 @@ define i16 @and16mi8(ptr %a) {
 ; CHECK-NEXT:    andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %and = and i16 %t, 123
@@ -224,6 +334,11 @@ define i32 @and32mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x27,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x27,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %and = and i32 %t, 123
@@ -236,6 +351,12 @@ define i64 @and64mi8(ptr %a) {
 ; CHECK-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
 ; CHECK-NEXT:    andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT:    andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %and = and i64 %t, 123
@@ -247,6 +368,11 @@ define i8 @and8mi(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x27,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x27,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %and = and i8 %t, 123
@@ -261,6 +387,14 @@ define i16 @and16mi(ptr %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    andl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x25,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %and = and i16 %t, 1234
@@ -273,6 +407,12 @@ define i32 @and32mi(ptr %a) {
 ; CHECK-NEXT:    andl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x27,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} andl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x27,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %and = and i32 %t, 123456
@@ -286,6 +426,13 @@ define i64 @and64mi(ptr %a) {
 ; CHECK-NEXT:    andl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x25,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT:    andl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x25,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %and = and i64 %t, 123456
@@ -303,6 +450,15 @@ define i1 @andflag8rr(i8 %a, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag8rr:
+; NF:       # %bb.0:
+; NF-NEXT:    notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
+; NF-NEXT:    andb %al, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x20,0xc7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 %b, -1
   %v0 = and i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -319,6 +475,15 @@ define i1 @andflag16rr(i16 %a, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag16rr:
+; NF:       # %bb.0:
+; NF-NEXT:    notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6]
+; NF-NEXT:    andw %ax, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x21,0xc7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 %b, -1
   %v0 = and i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -334,6 +499,14 @@ define i1 @andflag32rr(i32 %a, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag32rr:
+; NF:       # %bb.0:
+; NF-NEXT:    andl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x21,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -348,6 +521,14 @@ define i1 @andflag64rr(i64 %a, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag64rr:
+; NF:       # %bb.0:
+; NF-NEXT:    andq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x21,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -363,6 +544,15 @@ define i1 @andflag8rm(ptr %ptr, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag8rm:
+; NF:       # %bb.0:
+; NF-NEXT:    notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
+; NF-NEXT:    andb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x22,0x07]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i8, ptr %ptr
   %xor = xor i8 %b, -1
   %v0 = and i8 %a, %xor  ; 0xff << 50
@@ -380,6 +570,15 @@ define i1 @andflag16rm(ptr %ptr, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag16rm:
+; NF:       # %bb.0:
+; NF-NEXT:    notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6]
+; NF-NEXT:    andw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x23,0x07]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i16, ptr %ptr
   %xor = xor i16 %b, -1
   %v0 = and i16 %a, %xor  ; 0xff << 50
@@ -396,6 +595,14 @@ define i1 @andflag32rm(ptr %ptr, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag32rm:
+; NF:       # %bb.0:
+; NF-NEXT:    andl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x23,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i32, ptr %ptr
   %v0 = and i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
@@ -411,6 +618,14 @@ define i1 @andflag64rm(ptr %ptr, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag64rm:
+; NF:       # %bb.0:
+; NF-NEXT:    andq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x23,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i64, ptr %ptr
   %v0 = and i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
@@ -426,6 +641,14 @@ define i1 @andflag8ri(i8 %a) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag8ri:
+; NF:       # %bb.0:
+; NF-NEXT:    andb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xe7,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 123, -1
   %v0 = and i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -442,6 +665,15 @@ define i1 @andflag16ri(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag16ri:
+; NF:       # %bb.0:
+; NF-NEXT:    andw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xe7,0x2d,0xfb]
+; NF-NEXT:    # imm = 0xFB2D
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 1234, -1
   %v0 = and i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -458,6 +690,15 @@ define i1 @andflag32ri(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag32ri:
+; NF:       # %bb.0:
+; NF-NEXT:    andl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i32 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -473,6 +714,15 @@ define i1 @andflag64ri(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag64ri:
+; NF:       # %bb.0:
+; NF-NEXT:    andq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i64 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -487,6 +737,14 @@ define i1 @andflag16ri8(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag16ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    andw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xe7,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 123, -1
   %v0 = and i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -502,6 +760,14 @@ define i1 @andflag32ri8(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag32ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    andl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xe7,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i32 %a, 123  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -516,6 +782,14 @@ define i1 @andflag64ri8(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: andflag64ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    andq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xe7,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = and i64 %a, 123  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -527,6 +801,11 @@ define void @and8mr_legacy(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb %sil, (%rdi) # encoding: [0x40,0x20,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andb %sil, (%rdi) # encoding: [0x40,0x20,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %and = and i8 %t, %b
@@ -539,6 +818,11 @@ define void @and16mr_legacy(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andw %si, (%rdi) # encoding: [0x66,0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andw %si, (%rdi) # encoding: [0x66,0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %and = and i16 %t, %b
@@ -551,6 +835,11 @@ define void @and32mr_legacy(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andl %esi, (%rdi) # encoding: [0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andl %esi, (%rdi) # encoding: [0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %and = and i32 %t, %b
@@ -563,6 +852,11 @@ define void @and64mr_legacy(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andq %rsi, (%rdi) # encoding: [0x48,0x21,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andq %rsi, (%rdi) # encoding: [0x48,0x21,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %and = and i64 %t, %b
@@ -575,6 +869,11 @@ define void @and8mi_legacy(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    andb $123, (%rdi) # encoding: [0x80,0x27,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andb $123, (%rdi) # encoding: [0x80,0x27,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %and = and i8 %t, 123
@@ -588,6 +887,12 @@ define void @and16mi_legacy(ptr %a) {
 ; CHECK-NEXT:    andw $1234, (%rdi) # encoding: [0x66,0x81,0x27,0xd2,0x04]
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andw $1234, (%rdi) # encoding: [0x66,0x81,0x27,0xd2,0x04]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %and = and i16 %t, 1234
@@ -601,6 +906,12 @@ define void @and32mi_legacy(ptr %a) {
 ; CHECK-NEXT:    andl $123456, (%rdi) # encoding: [0x81,0x27,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andl $123456, (%rdi) # encoding: [0x81,0x27,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %and = and i32 %t, 123456
@@ -614,6 +925,12 @@ define void @and64mi_legacy(ptr %a) {
 ; CHECK-NEXT:    andq $123456, (%rdi) # encoding: [0x48,0x81,0x27,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: and64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    andq $123456, (%rdi) # encoding: [0x48,0x81,0x27,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %and = and i64 %t, 123456
diff --git a/llvm/test/CodeGen/X86/apx/compress-evex.mir b/llvm/test/CodeGen/X86/apx/compress-evex.mir
index d8bef886e234f..5a59ab0f8a9d0 100644
--- a/llvm/test/CodeGen/X86/apx/compress-evex.mir
+++ b/llvm/test/CodeGen/X86/apx/compress-evex.mir
@@ -1,4 +1,5 @@
-# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=x86-compress-evex -show-mc-encoding -o - | FileCheck %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=x86-compress-evex -show-mc-encoding -o - | FileCheck --check-prefixes=CHECK,NDD %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr,+nf -start-before=x86-compress-evex -show-mc-encoding -o - | FileCheck --check-prefixes=CHECK,NDD-NF %s
 
 ...
 ---
@@ -46,7 +47,8 @@ name:            ndd_2_non_ndd_incommutable
 body:             |
   bb.0.entry:
     liveins: $rdi, $rsi
-    ; CHECK: subq    %rax, %rsi, %rax                # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc6]
+    ; NDD:     subq    %rax, %rsi, %rax              # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc6]
+    ; NDD-NF: {nf} subq    %rax, %rsi, %rax          # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0xc6]
     renamable $rax = ADD64rr_ND killed renamable $rdi, renamable $rsi, implicit-def dead $eflags
     renamable $rax = SUB64rr_ND killed renamable $rsi, killed renamable $rax, implicit-def dead $eflags
     RET64 $rax
@@ -55,7 +57,8 @@ body:             |
 name:            ndd_2_non_ndd_mem
 body:             |
   bb.0.entry:
-    ; CHECK: addq    $123456, (%rax), %rax           # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x00,0x40,0xe2,0x01,0x00]
+    ; NDD:    addq    $123456, (%rax), %rax          # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x00,0x40,0xe2,0x01,0x00]
+    ; NDD-NF: {nf} addq $123456, (%rax), %rax        # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x00,0x40,0xe2,0x01,0x00]
     renamable $rax = MOV64rm $noreg, 1, $noreg, 0, $fs
     renamable $rax = nsw ADD64mi32_ND killed renamable $rax, 1, $noreg, 0, $noreg, 123456, implicit-def dead $eflags
     RET64 $rax
@@ -88,5 +91,39 @@ body:             |
     ; CHECK: bswapq  %rax                            # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0xc8]
     renamable $rax = MOVBE64rr killed renamable $rax
     RET64 killed $rax
-
+...
+---
+name:            non_nf_2_nf
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $r16
+    ; CHECK:  addq %r16, %rdi                        # encoding: [0xd5,0x48,0x01,0xc7]
+    ; NDD:    xorq %r16, %rdi, %rax                  # encoding: [0x62,0xe4,0xfc,0x18,0x31,0xc7]
+    ; NDD-NF: {nf} xorq %r16, %rdi, %rax             # EVEX TO EVEX Compression encoding: [0x62,0xe4,0xfc,0x1c,0x31,0xc7]
+    ; CHECK:  addq %r16, %rax, %rdi                  # encoding: [0x62,0xe4,0xc4,0x18,0x01,0xc0]
+    ; CHECK:  adcq %rdi, %r16, %rax                  # encoding: [0x62,0xfc,0xfc,0x18,0x11,0xf8]
+    $rdi = ADD64rr $rdi, $r16, implicit-def dead $eflags
+    $rax = XOR64rr_ND $rdi, $r16, implicit-def dead $eflags
+    $rdi = ADD64rr_ND $rax, $r16, implicit-def $eflags
+    $rax = ADC64rr_ND $r16, $rdi, implicit-def dead $eflags, implicit $eflags
+    RET64 $rax
+...
+---
+name:            cfcmov_no_convert
+body:             |
+  bb.0.entry:
+    liveins: $eflags, $rax, $rbx
+    ; CHECK: cfcmovew %bx, %ax, %ax                  # encoding: [0x62,0xf4,0x7d,0x1c,0x44,0xc3]
+    ; CHECK: cfcmovsw 24(%rax), %bx, %bx             # encoding: [0x62,0xf4,0x65,0x1c,0x48,0x58,0x18]
+    ; CHECK: cfcmovel %ebx, %eax, %eax               # encoding: [0x62,0xf4,0x7c,0x1c,0x44,0xc3]
+    ; CHECK: cfcmovsl 24(%rax), %ebx, %ebx           # encoding: [0x62,0xf4,0x64,0x1c,0x48,0x58,0x18]
+    ; CHECK: cfcmoveq %rbx, %rax, %rax               # encoding: [0x62,0xf4,0xfc,0x1c,0x44,0xc3]
+    ; CHECK: cfcmovsq 24(%rax), %rbx, %rbx           # encoding: [0x62,0xf4,0xe4,0x1c,0x48,0x58,0x18]
+    $ax = CFCMOV16rr_ND $ax, $bx, 4, implicit $eflags
+    $bx = CFCMOV16rm_ND $bx, $rax, 1, $noreg, 24, $noreg, 8, implicit $eflags
+    $eax = CFCMOV32rr_ND $eax, $ebx, 4, implicit $eflags
+    $ebx = CFCMOV32rm_ND $ebx, $rax, 1, $noreg, 24, $noreg, 8, implicit $eflags
+    $rax = CFCMOV64rr_ND $rax, $rbx, 4, implicit $eflags
+    $rbx = CFCMOV64rm_ND $rbx, $rax, 1, $noreg, 24, $noreg, 8, implicit $eflags
+    RET64 $rax
 ...
diff --git a/llvm/test/CodeGen/X86/apx/dec.ll b/llvm/test/CodeGen/X86/apx/dec.ll
index fcb2cae3b5cad..a18ed2ace603a 100644
--- a/llvm/test/CodeGen/X86/apx/dec.ll
+++ b/llvm/test/CodeGen/X86/apx/dec.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
 
 define i8 @dec8r(i8 noundef %a) {
 ; CHECK-LABEL: dec8r:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decb %dil, %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec8r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decb %dil, %al
+; NF-NEXT:    retq
 entry:
   %dec = sub i8 %a, 1
   ret i8 %dec
@@ -17,6 +23,12 @@ define i16 @dec16r(i16 noundef %a) {
 ; CHECK-NEXT:    decl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec16r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decl %edi, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %dec = sub i16 %a, 1
   ret i16 %dec
@@ -27,6 +39,11 @@ define i32 @dec32r(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec32r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decl %edi, %eax
+; NF-NEXT:    retq
 entry:
   %dec = sub i32 %a, 1
   ret i32 %dec
@@ -37,6 +54,11 @@ define i64 @dec64r(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decq %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec64r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decq %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %dec = sub i64 %a, 1
   ret i64 %dec
@@ -47,6 +69,11 @@ define i8 @dec8m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decb (%rdi), %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec8m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decb (%rdi), %al
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %dec = sub i8 %a, 1
@@ -60,6 +87,13 @@ define i16 @dec16m(ptr %ptr) {
 ; CHECK-NEXT:    decl %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec16m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax
+; NF-NEXT:    decl %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %dec = sub i16 %a, 1
@@ -71,6 +105,11 @@ define i32 @dec32m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec32m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decl (%rdi), %eax
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %dec = sub i32 %a, 1
@@ -82,6 +121,11 @@ define i64 @dec64m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decq (%rdi), %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec64m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} decq (%rdi), %rax
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %dec = sub i64 %a, 1
@@ -93,6 +137,11 @@ define void @dec8m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decb (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec8m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    decb (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %dec = sub i8 %a, 1
@@ -105,6 +154,11 @@ define void @dec16m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decw (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec16m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    decw (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %dec = sub i16 %a, 1
@@ -117,6 +171,11 @@ define void @dec32m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decl (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec32m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    decl (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %dec = sub i32 %a, 1
@@ -129,6 +188,11 @@ define void @dec64m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    decq (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: dec64m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    decq (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %dec = sub i64 %a, 1
diff --git a/llvm/test/CodeGen/X86/apx/imul.ll b/llvm/test/CodeGen/X86/apx/imul.ll
index 2963a6477be4c..d97b2c0baec5e 100644
--- a/llvm/test/CodeGen/X86/apx/imul.ll
+++ b/llvm/test/CodeGen/X86/apx/imul.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
 
 define i16 @mul16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-LABEL: mul16rr:
@@ -7,6 +8,12 @@ define i16 @mul16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    imull %esi, %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imull %esi, %edi, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %mul = mul i16 %a, %b
   ret i16 %mul
@@ -17,6 +24,11 @@ define i32 @mul32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imull %esi, %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imull %esi, %edi, %eax
+; NF-NEXT:    retq
 entry:
   %mul = mul i32 %a, %b
   ret i32 %mul
@@ -27,6 +39,11 @@ define i64 @mul64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulq %rsi, %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulq %rsi, %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %mul = mul i64 %a, %b
   ret i64 %mul
@@ -37,6 +54,11 @@ define i16 @smul16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulw %si, %di, %ax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulw %si, %di, %ax
+; NF-NEXT:    retq
 entry:
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a, i16 %b)
   %mul = extractvalue {i16, i1} %t, 0
@@ -48,6 +70,11 @@ define i32 @smul32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imull %esi, %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imull %esi, %edi, %eax
+; NF-NEXT:    retq
 entry:
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
   %mul = extractvalue {i32, i1} %t, 0
@@ -59,6 +86,11 @@ define i64 @smul64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulq %rsi, %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulq %rsi, %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a, i64 %b)
   %mul = extractvalue {i64, i1} %t, 0
@@ -70,6 +102,11 @@ define i16 @mul16rm(i16 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulw (%rsi), %di, %ax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulw (%rsi), %di, %ax
+; NF-NEXT:    retq
 entry:
   %b = load i16, ptr %ptr
   %mul = mul i16 %a, %b
@@ -81,6 +118,11 @@ define i32 @mul32rm(i32 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imull (%rsi), %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imull (%rsi), %edi, %eax
+; NF-NEXT:    retq
 entry:
   %b = load i32, ptr %ptr
   %mul = mul i32 %a, %b
@@ -92,6 +134,11 @@ define i64 @mul64rm(i64 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulq (%rsi), %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: mul64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulq (%rsi), %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %b = load i64, ptr %ptr
   %mul = mul i64 %a, %b
@@ -103,6 +150,11 @@ define i16 @smul16rm(i16 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulw (%rsi), %di, %ax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulw (%rsi), %di, %ax
+; NF-NEXT:    retq
 entry:
   %b = load i16, ptr %ptr
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a, i16 %b)
@@ -115,6 +167,11 @@ define i32 @smul32rm(i32 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imull (%rsi), %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imull (%rsi), %edi, %eax
+; NF-NEXT:    retq
 entry:
   %b = load i32, ptr %ptr
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
@@ -127,6 +184,11 @@ define i64 @smul64rm(i64 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    imulq (%rsi), %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: smul64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} imulq (%rsi), %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %b = load i64, ptr %ptr
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a, i64 %b)
diff --git a/llvm/test/CodeGen/X86/apx/inc.ll b/llvm/test/CodeGen/X86/apx/inc.ll
index a9c6d740cf2ce..8d31badb99779 100644
--- a/llvm/test/CodeGen/X86/apx/inc.ll
+++ b/llvm/test/CodeGen/X86/apx/inc.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
 
 define i8 @inc8r(i8 noundef %a) {
 ; CHECK-LABEL: inc8r:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incb %dil, %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc8r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incb %dil, %al
+; NF-NEXT:    retq
 entry:
   %inc = add i8 %a, 1
   ret i8 %inc
@@ -17,6 +23,12 @@ define i16 @inc16r(i16 noundef %a) {
 ; CHECK-NEXT:    incl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc16r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incl %edi, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %inc = add i16 %a, 1
   ret i16 %inc
@@ -27,6 +39,11 @@ define i32 @inc32r(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc32r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incl %edi, %eax
+; NF-NEXT:    retq
 entry:
   %inc = add i32 %a, 1
   ret i32 %inc
@@ -37,6 +54,11 @@ define i64 @inc64r(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incq %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc64r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incq %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %inc = add i64 %a, 1
   ret i64 %inc
@@ -47,6 +69,11 @@ define i8 @inc8m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incb (%rdi), %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc8m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incb (%rdi), %al
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %inc = add i8 %a, 1
@@ -60,6 +87,13 @@ define i16 @inc16m(ptr %ptr) {
 ; CHECK-NEXT:    incl %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc16m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax
+; NF-NEXT:    incl %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %inc = add i16 %a, 1
@@ -71,6 +105,11 @@ define i32 @inc32m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc32m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incl (%rdi), %eax
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %inc = add i32 %a, 1
@@ -82,6 +121,11 @@ define i64 @inc64m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incq (%rdi), %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc64m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} incq (%rdi), %rax
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %inc = add i64 %a, 1
@@ -97,6 +141,15 @@ define i8 @uinc8r(i8 noundef %a) {
 ; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uinc8r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incb %dil, %al
+; NF-NEXT:    movzbl %al, %eax
+; NF-NEXT:    movl $255, %ecx
+; NF-NEXT:    cmovel %ecx, %eax
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq
 entry:
   %inc = call i8 @llvm.uadd.sat.i8(i8 %a, i8 1)
   ret i8 %inc
@@ -110,6 +163,14 @@ define i16 @uinc16r(i16 noundef %a) {
 ; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uinc16r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incw %di, %ax
+; NF-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; NF-NEXT:    cmovel %ecx, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %inc = call i16 @llvm.uadd.sat.i16(i16 %a, i16 1)
   ret i16 %inc
@@ -122,6 +183,13 @@ define i32 @uinc32r(i32 noundef %a) {
 ; CHECK-NEXT:    movl $-1, %ecx
 ; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uinc32r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incl %edi, %eax
+; NF-NEXT:    movl $-1, %ecx
+; NF-NEXT:    cmovel %ecx, %eax
+; NF-NEXT:    retq
 entry:
   %inc = call i32 @llvm.uadd.sat.i32(i32 %a, i32 1)
   ret i32 %inc
@@ -134,6 +202,13 @@ define i64 @uinc64r(i64 noundef %a) {
 ; CHECK-NEXT:    movq $-1, %rcx
 ; CHECK-NEXT:    cmoveq %rcx, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uinc64r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incq %rdi, %rax
+; NF-NEXT:    movq $-1, %rcx
+; NF-NEXT:    cmoveq %rcx, %rax
+; NF-NEXT:    retq
 entry:
   %inc = call i64 @llvm.uadd.sat.i64(i64 %a, i64 1)
   ret i64 %inc
@@ -149,6 +224,11 @@ define void @inc8m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incb (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc8m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incb (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %inc = add i8 %a, 1
@@ -161,6 +241,11 @@ define void @inc16m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incw (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc16m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incw (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %inc = add i16 %a, 1
@@ -173,6 +258,11 @@ define void @inc32m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incl (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc32m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incl (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %inc = add i32 %a, 1
@@ -185,6 +275,11 @@ define void @inc64m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incq (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: inc64m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    incq (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %inc = add i64 %a, 1
diff --git a/llvm/test/CodeGen/X86/apx/neg.ll b/llvm/test/CodeGen/X86/apx/neg.ll
index c1c53fbdaebd8..5e033e33cb8b2 100644
--- a/llvm/test/CodeGen/X86/apx/neg.ll
+++ b/llvm/test/CodeGen/X86/apx/neg.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
 
 define i8 @neg8r(i8 noundef %a) {
 ; CHECK-LABEL: neg8r:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negb %dil, %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg8r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negb %dil, %al
+; NF-NEXT:    retq
 entry:
   %neg = sub i8 0, %a
   ret i8 %neg
@@ -17,6 +23,12 @@ define i16 @neg16r(i16 noundef %a) {
 ; CHECK-NEXT:    negl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg16r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl %edi, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %neg = sub i16 0, %a
   ret i16 %neg
@@ -27,6 +39,11 @@ define i32 @neg32r(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg32r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl %edi, %eax
+; NF-NEXT:    retq
 entry:
   %neg = sub i32 0, %a
   ret i32 %neg
@@ -37,6 +54,11 @@ define i64 @neg64r(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negq %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg64r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negq %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %neg = sub i64 0, %a
   ret i64 %neg
@@ -47,6 +69,11 @@ define i8 @neg8m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negb (%rdi), %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg8m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negb (%rdi), %al
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %neg = sub i8 0, %a
@@ -58,6 +85,11 @@ define i16 @neg16m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negw (%rdi), %ax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg16m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negw (%rdi), %ax
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %neg = sub i16 0, %a
@@ -69,6 +101,11 @@ define i32 @neg32m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg32m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl (%rdi), %eax
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %neg = sub i32 0, %a
@@ -80,6 +117,11 @@ define i64 @neg64m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negq (%rdi), %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg64m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negq (%rdi), %rax
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %neg = sub i64 0, %a
@@ -91,6 +133,11 @@ define i8 @uneg8r(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negb %dil, %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg8r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negb %dil, %al
+; NF-NEXT:    retq
 entry:
   %t = call {i8, i1} @llvm.usub.with.overflow.i8(i8 0, i8 %a)
   %neg = extractvalue {i8, i1} %t, 0
@@ -103,6 +150,12 @@ define i16 @uneg16r(i16 noundef %a) {
 ; CHECK-NEXT:    negl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg16r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl %edi, %eax
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq
 entry:
   %t = call {i16, i1} @llvm.usub.with.overflow.i16(i16 0, i16 %a)
   %neg = extractvalue {i16, i1} %t, 0
@@ -114,6 +167,11 @@ define i32 @uneg32r(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg32r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl %edi, %eax
+; NF-NEXT:    retq
 entry:
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 0, i32 %a)
   %neg = extractvalue {i32, i1} %t, 0
@@ -125,6 +183,11 @@ define i64 @uneg64r(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negq %rdi, %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg64r:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negq %rdi, %rax
+; NF-NEXT:    retq
 entry:
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 0, i64 %a)
   %neg = extractvalue {i64, i1} %t, 0
@@ -136,6 +199,11 @@ define i8 @uneg8m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negb (%rdi), %al
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg8m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negb (%rdi), %al
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %t = call {i8, i1} @llvm.usub.with.overflow.i8(i8 0, i8 %a)
@@ -148,6 +216,11 @@ define i16 @uneg16m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negw (%rdi), %ax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg16m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negw (%rdi), %ax
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %t = call {i16, i1} @llvm.usub.with.overflow.i16(i16 0, i16 %a)
@@ -160,6 +233,11 @@ define i32 @uneg32m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negl (%rdi), %eax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg32m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negl (%rdi), %eax
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 0, i32 %a)
@@ -172,6 +250,11 @@ define i64 @uneg64m(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negq (%rdi), %rax
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: uneg64m:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} negq (%rdi), %rax
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 0, i64 %a)
@@ -189,6 +272,11 @@ define void @neg8m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negb (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg8m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    negb (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i8, ptr %ptr
   %neg = sub i8 0, %a
@@ -201,6 +289,11 @@ define void @neg16m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negw (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg16m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    negw (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i16, ptr %ptr
   %neg = sub i16 0, %a
@@ -213,6 +306,11 @@ define void @neg32m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negl (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg32m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    negl (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i32, ptr %ptr
   %neg = sub i32 0, %a
@@ -225,6 +323,11 @@ define void @neg64m_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    negq (%rdi)
 ; CHECK-NEXT:    retq
+;
+; NF-LABEL: neg64m_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    negq (%rdi)
+; NF-NEXT:    retq
 entry:
   %a = load i64, ptr %ptr
   %neg = sub i64 0, %a
diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll
index 3d024e962400f..d404279e14f7a 100644
--- a/llvm/test/CodeGen/X86/apx/or.ll
+++ b/llvm/test/CodeGen/X86/apx/or.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @or8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: or8rr:
@@ -7,6 +8,12 @@ define i8 @or8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-NEXT:    orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i8 %a, %b
     ret i8 %or
@@ -18,6 +25,12 @@ define i16 @or16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i16 %a, %b
     ret i16 %or
@@ -28,6 +41,11 @@ define i32 @or32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i32 %a, %b
     ret i32 %or
@@ -38,6 +56,11 @@ define i64 @or64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x09,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x09,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i64 %a, %b
     ret i64 %or
@@ -48,6 +71,11 @@ define i8 @or8rm(i8 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x0a,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x0a,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i8, ptr %b
     %or = or i8 %a, %t
@@ -59,6 +87,11 @@ define i16 @or16rm(i16 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x0b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x0b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i16, ptr %b
     %or = or i16 %a, %t
@@ -70,6 +103,11 @@ define i32 @or32rm(i32 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x0b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x0b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
     %or = or i32 %a, %t
@@ -81,6 +119,11 @@ define i64 @or64rm(i64 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x0b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x0b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
     %or = or i64 %a, %t
@@ -93,6 +136,12 @@ define i16 @or16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    orl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xcf,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xcf,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i16 %a, 123
     ret i16 %or
@@ -103,6 +152,11 @@ define i32 @or32ri8(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xcf,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xcf,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i32 %a, 123
     ret i32 %or
@@ -113,6 +167,11 @@ define i64 @or64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xcf,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq $123, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xcf,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i64 %a, 123
     ret i64 %or
@@ -123,6 +182,11 @@ define i8 @or8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xcf,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xcf,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i8 %a, 123
     ret i8 %or
@@ -135,6 +199,13 @@ define i16 @or16ri(i16 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xcf,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i16 %a, 1234
     ret i16 %or
@@ -146,6 +217,12 @@ define i32 @or32ri(i32 noundef %a) {
 ; CHECK-NEXT:    orl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i32 %a, 123456
     ret i32 %or
@@ -157,6 +234,12 @@ define i64 @or64ri(i64 noundef %a) {
 ; CHECK-NEXT:    orq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq $123456, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i64 %a, 123456
     ret i64 %or
@@ -167,6 +250,11 @@ define i8 @or8mr(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x08,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x08,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %or = or i8 %t, %b
@@ -178,6 +266,11 @@ define i16 @or16mr(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %or = or i16 %t, %b
@@ -189,6 +282,11 @@ define i32 @or32mr(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %or = or i32 %t, %b
@@ -200,6 +298,11 @@ define i64 @or64mr(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %or = or i64 %t, %b
@@ -213,6 +316,13 @@ define i16 @or16mi8(ptr %a) {
 ; CHECK-NEXT:    orl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    orl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %or = or i16 %t, 123
@@ -224,6 +334,11 @@ define i32 @or32mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x0f,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x0f,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %or = or i32 %t, 123
@@ -235,6 +350,11 @@ define i64 @or64mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x0f,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x0f,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %or = or i64 %t, 123
@@ -246,6 +366,11 @@ define i8 @or8mi(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x0f,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x0f,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %or = or i8 %t, 123
@@ -260,6 +385,14 @@ define i16 @or16mi(ptr %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    orl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x0d,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %or = or i16 %t, 1234
@@ -272,6 +405,12 @@ define i32 @or32mi(ptr %a) {
 ; CHECK-NEXT:    orl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x0f,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %or = or i32 %t, 123456
@@ -284,6 +423,12 @@ define i64 @or64mi(ptr %a) {
 ; CHECK-NEXT:    orq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x0f,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} orq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %or = or i64 %t, 123456
@@ -301,6 +446,15 @@ define i1 @orflag8rr(i8 %a, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag8rr:
+; NF:       # %bb.0:
+; NF-NEXT:    notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
+; NF-NEXT:    orb %al, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x08,0xc7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 %b, -1
   %v0 = or i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -317,6 +471,15 @@ define i1 @orflag16rr(i16 %a, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag16rr:
+; NF:       # %bb.0:
+; NF-NEXT:    notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6]
+; NF-NEXT:    orw %ax, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x09,0xc7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 %b, -1
   %v0 = or i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -332,6 +495,14 @@ define i1 @orflag32rr(i32 %a, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag32rr:
+; NF:       # %bb.0:
+; NF-NEXT:    orl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x09,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -346,6 +517,14 @@ define i1 @orflag64rr(i64 %a, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag64rr:
+; NF:       # %bb.0:
+; NF-NEXT:    orq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x09,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -361,6 +540,15 @@ define i1 @orflag8rm(ptr %ptr, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag8rm:
+; NF:       # %bb.0:
+; NF-NEXT:    notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
+; NF-NEXT:    orb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x0a,0x07]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i8, ptr %ptr
   %xor = xor i8 %b, -1
   %v0 = or i8 %a, %xor  ; 0xff << 50
@@ -378,6 +566,15 @@ define i1 @orflag16rm(ptr %ptr, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag16rm:
+; NF:       # %bb.0:
+; NF-NEXT:    notl %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd6]
+; NF-NEXT:    orw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x0b,0x07]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i16, ptr %ptr
   %xor = xor i16 %b, -1
   %v0 = or i16 %a, %xor  ; 0xff << 50
@@ -394,6 +591,14 @@ define i1 @orflag32rm(ptr %ptr, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag32rm:
+; NF:       # %bb.0:
+; NF-NEXT:    orl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x0b,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i32, ptr %ptr
   %v0 = or i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
@@ -409,6 +614,14 @@ define i1 @orflag64rm(ptr %ptr, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag64rm:
+; NF:       # %bb.0:
+; NF-NEXT:    orq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x0b,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i64, ptr %ptr
   %v0 = or i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
@@ -424,6 +637,14 @@ define i1 @orflag8ri(i8 %a) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag8ri:
+; NF:       # %bb.0:
+; NF-NEXT:    orb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xcf,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 123, -1
   %v0 = or i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -440,6 +661,15 @@ define i1 @orflag16ri(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag16ri:
+; NF:       # %bb.0:
+; NF-NEXT:    orw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xcf,0x2d,0xfb]
+; NF-NEXT:    # imm = 0xFB2D
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 1234, -1
   %v0 = or i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -456,6 +686,15 @@ define i1 @orflag32ri(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag32ri:
+; NF:       # %bb.0:
+; NF-NEXT:    orl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i32 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -471,6 +710,15 @@ define i1 @orflag64ri(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag64ri:
+; NF:       # %bb.0:
+; NF-NEXT:    orq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i64 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -485,6 +733,14 @@ define i1 @orflag16ri8(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag16ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    orw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xcf,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 123, -1
   %v0 = or i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -500,6 +756,14 @@ define i1 @orflag32ri8(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag32ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    orl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xcf,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i32 %a, 123  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -514,6 +778,14 @@ define i1 @orflag64ri8(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: orflag64ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    orq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xcf,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = or i64 %a, 123  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -525,6 +797,11 @@ define void @or8mr_legacy(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb %sil, (%rdi) # encoding: [0x40,0x08,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orb %sil, (%rdi) # encoding: [0x40,0x08,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %or = or i8 %t, %b
@@ -537,6 +814,11 @@ define void @or16mr_legacy(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orw %si, (%rdi) # encoding: [0x66,0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orw %si, (%rdi) # encoding: [0x66,0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %or = or i16 %t, %b
@@ -549,6 +831,11 @@ define void @or32mr_legacy(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orl %esi, (%rdi) # encoding: [0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orl %esi, (%rdi) # encoding: [0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %or = or i32 %t, %b
@@ -561,6 +848,11 @@ define void @or64mr_legacy(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orq %rsi, (%rdi) # encoding: [0x48,0x09,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orq %rsi, (%rdi) # encoding: [0x48,0x09,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %or = or i64 %t, %b
@@ -573,6 +865,11 @@ define void @or8mi_legacy(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    orb $123, (%rdi) # encoding: [0x80,0x0f,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orb $123, (%rdi) # encoding: [0x80,0x0f,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %or = or i8 %t, 123
@@ -586,6 +883,12 @@ define void @or16mi_legacy(ptr %a) {
 ; CHECK-NEXT:    orw $1234, (%rdi) # encoding: [0x66,0x81,0x0f,0xd2,0x04]
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orw $1234, (%rdi) # encoding: [0x66,0x81,0x0f,0xd2,0x04]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %or = or i16 %t, 1234
@@ -599,6 +902,12 @@ define void @or32mi_legacy(ptr %a) {
 ; CHECK-NEXT:    orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %or = or i32 %t, 123456
@@ -612,6 +921,12 @@ define void @or64mi_legacy(ptr %a) {
 ; CHECK-NEXT:    orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: or64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %or = or i64 %t, 123456
diff --git a/llvm/test/CodeGen/X86/apx/shl.ll b/llvm/test/CodeGen/X86/apx/shl.ll
index 869caf932ff92..35b6cb27254b2 100644
--- a/llvm/test/CodeGen/X86/apx/shl.ll
+++ b/llvm/test/CodeGen/X86/apx/shl.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @shl8ri(i8 noundef %a) {
 ; CHECK-LABEL: shl8ri:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlb $4, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xe7,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shlb $4, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0xe7,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i8 %a, 4
   ret i8 %shl
@@ -17,6 +23,12 @@ define i16 @shl16ri(i16 noundef %a) {
 ; CHECK-NEXT:    shll $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xe7,0x04]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shll $4, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0xe7,0x04]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i16 %a, 4
   ret i16 %shl
@@ -27,6 +39,11 @@ define i32 @shl32ri(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shll $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xe7,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shll $4, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0xe7,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i32 %a, 4
   ret i32 %shl
@@ -37,6 +54,11 @@ define i64 @shl64ri(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlq $4, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xe7,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shlq $4, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0xe7,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i64 %a, 4
   ret i64 %shl
@@ -48,6 +70,12 @@ define i8 @shl8m1(ptr %ptr) {
 ; CHECK-NEXT:    movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
 ; CHECK-NEXT:    addb %al, %al # EVEX TO LEGACY Compression encoding: [0x00,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT:    addb %al, %al # EVEX TO LEGACY Compression encoding: [0x00,0xc0]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, 1
@@ -61,6 +89,13 @@ define i16 @shl16m1(ptr %ptr) {
 ; CHECK-NEXT:    addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, 1
@@ -73,6 +108,12 @@ define i32 @shl32m1(ptr %ptr) {
 ; CHECK-NEXT:    movl (%rdi), %eax # encoding: [0x8b,0x07]
 ; CHECK-NEXT:    addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT:    addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, 1
@@ -85,6 +126,12 @@ define i64 @shl64m1(ptr %ptr) {
 ; CHECK-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
 ; CHECK-NEXT:    addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT:    addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, 1
@@ -98,6 +145,13 @@ define i8 @shl8mcl(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shlb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, %cl
@@ -111,6 +165,13 @@ define i8 @shl8mcl_mask(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shlb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shamt = and i8 %cl, 31
@@ -127,6 +188,15 @@ define i16 @shl16mcl(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, %cl
@@ -142,6 +212,15 @@ define i16 @shl16mcl_mask(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shamt = and i16 %cl, 31
@@ -156,6 +235,13 @@ define i32 @shl32mcl(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, %cl
@@ -169,6 +255,13 @@ define i32 @shl32mcl_mask(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shamt = and i32 %cl, 31
@@ -183,6 +276,13 @@ define i64 @shl64mcl(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shlq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, %cl
@@ -196,6 +296,13 @@ define i64 @shl64mcl_mask(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shlq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shamt = and i64 %cl, 63
@@ -208,6 +315,11 @@ define i8 @shl8mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shlb $4, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, 4
@@ -221,6 +333,13 @@ define i16 @shl16mi(ptr %ptr) {
 ; CHECK-NEXT:    shll $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe0,0x04]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    shll $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe0,0x04]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, 4
@@ -232,6 +351,11 @@ define i32 @shl32mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shll $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shll $4, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, 4
@@ -243,6 +367,11 @@ define i64 @shl64mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shlq $4, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, 4
@@ -254,6 +383,11 @@ define i8 @shl8r1(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb %dil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xff]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb %dil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x00,0xff]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i8 %a, 1
   ret i8 %shl
@@ -265,6 +399,12 @@ define i16 @shl16r1(i16 noundef %a) {
 ; CHECK-NEXT:    addl %edi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xff]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl %edi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xff]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i16 %a, 1
   ret i16 %shl
@@ -275,6 +415,11 @@ define i32 @shl32r1(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addl %edi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xff]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl %edi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x01,0xff]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i32 %a, 1
   ret i32 %shl
@@ -285,6 +430,11 @@ define i64 @shl64r1(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addq %rdi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xff]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addq %rdi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x01,0xff]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i64 %a, 1
   ret i64 %shl
@@ -297,6 +447,13 @@ define i8 @shl8rcl(i8 noundef %a, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shlb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i8 %a, %cl
   ret i8 %shl
@@ -309,6 +466,13 @@ define i8 @shl8rcl_mask(i8 noundef %a, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shlb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i8 %cl, 31
   %shl = shl i8 %a, %shamt
@@ -323,6 +487,14 @@ define i16 @shl16rcl(i16 noundef %a, i16 %cl) {
 ; CHECK-NEXT:    shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i16 %a, %cl
   ret i16 %shl
@@ -336,6 +508,14 @@ define i16 @shl16rcl_mask(i16 noundef %a, i16 %cl) {
 ; CHECK-NEXT:    shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i16 %cl, 31
   %shl = shl i16 %a, %shamt
@@ -349,6 +529,13 @@ define i32 @shl32rcl(i32 noundef %a, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i32 %a, %cl
   ret i32 %shl
@@ -361,6 +548,13 @@ define i32 @shl32rcl_mask(i32 noundef %a, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shll %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i32 %cl, 31
   %shl = shl i32 %a, %shamt
@@ -374,6 +568,13 @@ define i64 @shl64rcl(i64 noundef %a, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shlq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shlq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shl = shl i64 %a, %cl
   ret i64 %shl
@@ -386,6 +587,13 @@ define i64 @shl64rcl_mask(i64 noundef %a, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shlq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xe7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shlq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xe7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i64 %cl, 63
   %shl = shl i64 %a, %shamt
@@ -397,6 +605,11 @@ define void @shl8m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlb (%rdi) # encoding: [0xd0,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlb (%rdi) # encoding: [0xd0,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, 1
@@ -409,6 +622,11 @@ define void @shl16m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlw (%rdi) # encoding: [0x66,0xd1,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlw (%rdi) # encoding: [0x66,0xd1,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, 1
@@ -421,6 +639,11 @@ define void @shl32m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shll (%rdi) # encoding: [0xd1,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shll (%rdi) # encoding: [0xd1,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, 1
@@ -433,6 +656,11 @@ define void @shl64m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlq (%rdi) # encoding: [0x48,0xd1,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlq (%rdi) # encoding: [0x48,0xd1,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, 1
@@ -445,6 +673,11 @@ define void @shl8mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlb $4, (%rdi) # encoding: [0xc0,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlb $4, (%rdi) # encoding: [0xc0,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, 4
@@ -457,6 +690,11 @@ define void @shl16mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlw $4, (%rdi) # encoding: [0x66,0xc1,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlw $4, (%rdi) # encoding: [0x66,0xc1,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, 4
@@ -469,6 +707,11 @@ define void @shl32mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shll $4, (%rdi) # encoding: [0xc1,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shll $4, (%rdi) # encoding: [0xc1,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, 4
@@ -481,6 +724,11 @@ define void @shl64mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shlq $4, (%rdi) # encoding: [0x48,0xc1,0x27,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shlq $4, (%rdi) # encoding: [0x48,0xc1,0x27,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, 4
@@ -495,6 +743,13 @@ define void @shl8mcl_legacy(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, (%rdi) # encoding: [0xd2,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl8mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shlb %cl, (%rdi) # encoding: [0xd2,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shl = shl i8 %a, %cl
@@ -509,6 +764,13 @@ define void @shl16mcl_legacy(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlw %cl, (%rdi) # encoding: [0x66,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl16mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shlw %cl, (%rdi) # encoding: [0x66,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shl = shl i16 %a, %cl
@@ -523,6 +785,13 @@ define void @shl32mcl_legacy(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, (%rdi) # encoding: [0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl32mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shll %cl, (%rdi) # encoding: [0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shl = shl i32 %a, %cl
@@ -537,6 +806,13 @@ define void @shl64mcl_legacy(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shlq %cl, (%rdi) # encoding: [0x48,0xd3,0x27]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shl64mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    shlq %cl, (%rdi) # encoding: [0x48,0xd3,0x27]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shl = shl i64 %a, %cl
diff --git a/llvm/test/CodeGen/X86/apx/shr.ll b/llvm/test/CodeGen/X86/apx/shr.ll
index a7e02d8586f49..b5b91b02fedff 100644
--- a/llvm/test/CodeGen/X86/apx/shr.ll
+++ b/llvm/test/CodeGen/X86/apx/shr.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @shr8m1(ptr %ptr) {
 ; CHECK-LABEL: shr8m1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrb (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd0,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, 1
@@ -19,6 +25,13 @@ define i16 @shr16m1(ptr %ptr) {
 ; CHECK-NEXT:    shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, 1
@@ -30,6 +43,11 @@ define i32 @shr32m1(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrl (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd1,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, 1
@@ -41,6 +59,11 @@ define i64 @shr64m1(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64m1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrq (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd1,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, 1
@@ -54,6 +77,13 @@ define i8 @shr8mcl(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, %cl
@@ -67,6 +97,13 @@ define i8 @shr8mcl_mask(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shamt = and i8 %cl, 31
@@ -83,6 +120,15 @@ define i16 @shr16mcl(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, %cl
@@ -98,6 +144,15 @@ define i16 @shr16mcl_mask(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shamt = and i16 %cl, 31
@@ -112,6 +167,13 @@ define i32 @shr32mcl(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrl %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, %cl
@@ -125,6 +187,13 @@ define i32 @shr32mcl_mask(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrl %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shamt = and i32 %cl, 31
@@ -139,6 +208,13 @@ define i64 @shr64mcl(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64mcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shrq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, %cl
@@ -152,6 +228,13 @@ define i64 @shr64mcl_mask(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64mcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shrq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shamt = and i64 %cl, 63
@@ -164,6 +247,11 @@ define i8 @shr8mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrb $4, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, 4
@@ -177,6 +265,13 @@ define i16 @shr16mi(ptr %ptr) {
 ; CHECK-NEXT:    shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, 4
@@ -188,6 +283,11 @@ define i32 @shr32mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrl $4, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, 4
@@ -199,6 +299,11 @@ define i64 @shr64mi(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrq $4, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, 4
@@ -210,6 +315,11 @@ define i8 @shr8r1(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrb %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd0,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i8 %a, 1
   ret i8 %shr
@@ -222,6 +332,13 @@ define i16 @shr16r1(i16 noundef %a) {
 ; CHECK-NEXT:    shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7]
+; NF-NEXT:    shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i16 %a, 1
   ret i16 %shr
@@ -232,6 +349,11 @@ define i32 @shr32r1(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrl %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd1,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i32 %a, 1
   ret i32 %shr
@@ -242,6 +364,11 @@ define i64 @shr64r1(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64r1:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrq %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd1,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i64 %a, 1
   ret i64 %shr
@@ -254,6 +381,13 @@ define i8 @shr8rcl(i8 noundef %a, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i8 %a, %cl
   ret i8 %shr
@@ -266,6 +400,13 @@ define i8 @shr8rcl_mask(i8 noundef %a, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrb %cl, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i8 %cl, 31
   %shr = lshr i8 %a, %shamt
@@ -281,6 +422,15 @@ define i16 @shr16rcl(i16 noundef %a, i16 %cl) {
 ; CHECK-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i16 %a, %cl
   ret i16 %shr
@@ -295,6 +445,15 @@ define i16 @shr16rcl_mask(i16 noundef %a, i16 %cl) {
 ; CHECK-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i16 %cl, 31
   %shr = lshr i16 %a, %shamt
@@ -308,6 +467,13 @@ define i32 @shr32rcl(i32 noundef %a, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrl %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i32 %a, %cl
   ret i32 %shr
@@ -320,6 +486,13 @@ define i32 @shr32rcl_mask(i32 noundef %a, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    {nf} shrl %cl, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i32 %cl, 31
   %shr = lshr i32 %a, %shamt
@@ -333,6 +506,13 @@ define i64 @shr64rcl(i64 noundef %a, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64rcl:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shrq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i64 %a, %cl
   ret i64 %shr
@@ -345,6 +525,13 @@ define i64 @shr64rcl_mask(i64 noundef %a, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xef]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64rcl_mask:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    {nf} shrq %cl, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0xef]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shamt = and i64 %cl, 63
   %shr = lshr i64 %a, %shamt
@@ -356,6 +543,11 @@ define i8 @shr8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb $4, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xef,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrb $4, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0xef,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i8 %a, 4
   ret i8 %shr
@@ -368,6 +560,13 @@ define i16 @shr16ri(i16 noundef %a) {
 ; CHECK-NEXT:    shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7]
+; NF-NEXT:    shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i16 %a, 4
   ret i16 %shr
@@ -378,6 +577,11 @@ define i32 @shr32ri(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xef,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrl $4, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0xef,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i32 %a, 4
   ret i32 %shr
@@ -388,6 +592,11 @@ define i64 @shr64ri(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq $4, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xef,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} shrq $4, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0xef,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %shr = lshr i64 %a, 4
   ret i64 %shr
@@ -398,6 +607,11 @@ define void @shr8m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb (%rdi) # encoding: [0xd0,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrb (%rdi) # encoding: [0xd0,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, 1
@@ -410,6 +624,11 @@ define void @shr16m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrw (%rdi) # encoding: [0x66,0xd1,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrw (%rdi) # encoding: [0x66,0xd1,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, 1
@@ -422,6 +641,11 @@ define void @shr32m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl (%rdi) # encoding: [0xd1,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrl (%rdi) # encoding: [0xd1,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, 1
@@ -434,6 +658,11 @@ define void @shr64m1_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq (%rdi) # encoding: [0x48,0xd1,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64m1_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrq (%rdi) # encoding: [0x48,0xd1,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, 1
@@ -446,6 +675,11 @@ define void @shr8mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrb $4, (%rdi) # encoding: [0xc0,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrb $4, (%rdi) # encoding: [0xc0,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, 4
@@ -458,6 +692,11 @@ define void @shr16mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrw $4, (%rdi) # encoding: [0x66,0xc1,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrw $4, (%rdi) # encoding: [0x66,0xc1,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, 4
@@ -470,6 +709,11 @@ define void @shr32mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrl $4, (%rdi) # encoding: [0xc1,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrl $4, (%rdi) # encoding: [0xc1,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, 4
@@ -482,6 +726,11 @@ define void @shr64mi_legacy(ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    shrq $4, (%rdi) # encoding: [0x48,0xc1,0x2f,0x04]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    shrq $4, (%rdi) # encoding: [0x48,0xc1,0x2f,0x04]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, 4
@@ -496,6 +745,13 @@ define void @shr8mcl_legacy(ptr %ptr, i8 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, (%rdi) # encoding: [0xd2,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr8mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrb %cl, (%rdi) # encoding: [0xd2,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i8, ptr %ptr
   %shr = lshr i8 %a, %cl
@@ -510,6 +766,13 @@ define void @shr16mcl_legacy(ptr %ptr, i16 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrw %cl, (%rdi) # encoding: [0x66,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr16mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrw %cl, (%rdi) # encoding: [0x66,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i16, ptr %ptr
   %shr = lshr i16 %a, %cl
@@ -524,6 +787,13 @@ define void @shr32mcl_legacy(ptr %ptr, i32 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, (%rdi) # encoding: [0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr32mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $ecx
+; NF-NEXT:    shrl %cl, (%rdi) # encoding: [0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i32, ptr %ptr
   %shr = lshr i32 %a, %cl
@@ -538,6 +808,13 @@ define void @shr64mcl_legacy(ptr %ptr, i64 %cl) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, (%rdi) # encoding: [0x48,0xd3,0x2f]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: shr64mcl_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT:    # kill: def $cl killed $cl killed $rcx
+; NF-NEXT:    shrq %cl, (%rdi) # encoding: [0x48,0xd3,0x2f]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %a = load i64, ptr %ptr
   %shr = lshr i64 %a, %cl
diff --git a/llvm/test/CodeGen/X86/apx/sub.ll b/llvm/test/CodeGen/X86/apx/sub.ll
index be0914c90b9fa..a38d09587ba91 100644
--- a/llvm/test/CodeGen/X86/apx/sub.ll
+++ b/llvm/test/CodeGen/X86/apx/sub.ll
@@ -1,11 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @sub8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: sub8rr:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x28,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i8 %a, %b
     ret i8 %sub
@@ -17,6 +23,12 @@ define i16 @sub16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    subl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0xf7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i16 %a, %b
     ret i16 %sub
@@ -27,6 +39,11 @@ define i32 @sub32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i32 %a, %b
     ret i32 %sub
@@ -37,6 +54,11 @@ define i64 @sub64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i64 %a, %b
     ret i64 %sub
@@ -47,6 +69,11 @@ define i8 @sub8rm(i8 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x2a,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x2a,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i8, ptr %ptr
     %sub = sub i8 %a, %b
@@ -58,6 +85,11 @@ define i16 @sub16rm(i16 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x2b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x2b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i16, ptr %ptr
     %sub = sub i16 %a, %b
@@ -69,6 +101,11 @@ define i32 @sub32rm(i32 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x2b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x2b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i32, ptr %ptr
     %sub = sub i32 %a, %b
@@ -80,6 +117,11 @@ define i64 @sub64rm(i64 noundef %a, ptr %ptr) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x2b,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x2b,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %b = load i64, ptr %ptr
     %sub = sub i64 %a, %b
@@ -92,6 +134,12 @@ define i16 @sub16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    subl $-128, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xef,0x80]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl $-128, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xef,0x80]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i16 %a, -128
     ret i16 %sub
@@ -102,6 +150,11 @@ define i32 @sub32ri8(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl $-128, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xef,0x80]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl $-128, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xef,0x80]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i32 %a, -128
     ret i32 %sub
@@ -112,6 +165,11 @@ define i64 @sub64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq $-128, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xef,0x80]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq $-128, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xef,0x80]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i64 %a, -128
     ret i64 %sub
@@ -122,6 +180,11 @@ define i8 @sub8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $-123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x85]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb $-123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xc7,0x85]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i8 %a, 123
     ret i8 %sub
@@ -134,6 +197,13 @@ define i16 @sub16ri(i16 noundef %a) {
 ; CHECK-NEXT:    # imm = 0xFB2E
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $-1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0x2e,0xfb,0xff,0xff]
+; NF-NEXT:    # imm = 0xFB2E
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i16 %a, 1234
     ret i16 %sub
@@ -145,6 +215,12 @@ define i32 @sub32ri(i32 noundef %a) {
 ; CHECK-NEXT:    addl $-123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0xc0,0x1d,0xfe,0xff]
 ; CHECK-NEXT:    # imm = 0xFFFE1DC0
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $-123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xc7,0xc0,0x1d,0xfe,0xff]
+; NF-NEXT:    # imm = 0xFFFE1DC0
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i32 %a, 123456
     ret i32 %sub
@@ -156,6 +232,12 @@ define i64 @sub64ri(i64 noundef %a) {
 ; CHECK-NEXT:    subq $-2147483648, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xef,0x00,0x00,0x00,0x80]
 ; CHECK-NEXT:    # imm = 0x80000000
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq $-2147483648, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xef,0x00,0x00,0x00,0x80]
+; NF-NEXT:    # imm = 0x80000000
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = sub i64 %a, -2147483648
     ret i64 %sub
@@ -166,6 +248,11 @@ define i8 @sub8mr(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x28,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %sub = sub nsw i8 %t, %b
@@ -179,6 +266,13 @@ define i16 @sub16mr(ptr %a, i16 noundef %b) {
 ; CHECK-NEXT:    subl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x29,0xf0]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    subl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x29,0xf0]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %sub = sub nsw i16 %t, %b
@@ -190,6 +284,11 @@ define i32 @sub32mr(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %sub = sub nsw i32 %t, %b
@@ -201,6 +300,11 @@ define i64 @sub64mr(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %sub = sub nsw i64 %t, %b
@@ -214,6 +318,13 @@ define i16 @sub16mi8(ptr %a) {
 ; CHECK-NEXT:    subl $-128, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe8,0x80]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    subl $-128, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe8,0x80]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %sub = sub nsw i16 %t, -128
@@ -225,6 +336,11 @@ define i32 @sub32mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl $-128, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x2f,0x80]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subl $-128, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x2f,0x80]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %sub = sub nsw i32 %t, -128
@@ -236,6 +352,11 @@ define i64 @sub64mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq $-128, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x2f,0x80]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq $-128, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x2f,0x80]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %sub = sub nsw i64 %t, -128
@@ -247,6 +368,11 @@ define i8 @sub8mi(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $-123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x07,0x85]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addb $-123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x07,0x85]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %sub = sub nsw i8 %t, 123
@@ -261,6 +387,14 @@ define i16 @sub16mi(ptr %a) {
 ; CHECK-NEXT:    # imm = 0xFB2E
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    addl $-1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0x2e,0xfb,0xff,0xff]
+; NF-NEXT:    # imm = 0xFB2E
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %sub = sub nsw i16 %t, 1234
@@ -273,6 +407,12 @@ define i32 @sub32mi(ptr %a) {
 ; CHECK-NEXT:    addl $-123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x07,0xc0,0x1d,0xfe,0xff]
 ; CHECK-NEXT:    # imm = 0xFFFE1DC0
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} addl $-123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x07,0xc0,0x1d,0xfe,0xff]
+; NF-NEXT:    # imm = 0xFFFE1DC0
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %sub = sub nsw i32 %t, 123456
@@ -285,6 +425,12 @@ define i64 @sub64mi(ptr %a) {
 ; CHECK-NEXT:    subq $-2147483648, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x2f,0x00,0x00,0x00,0x80]
 ; CHECK-NEXT:    # imm = 0x80000000
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} subq $-2147483648, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x2f,0x00,0x00,0x00,0x80]
+; NF-NEXT:    # imm = 0x80000000
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %sub = sub nsw i64 %t, -2147483648
@@ -305,6 +451,15 @@ define i8 @subflag8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subb %sil, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x28,0xf7]
+; NF-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i8 @llvm.usub.sat.i8(i8 %a, i8 %b)
     ret i8 %sub
@@ -318,6 +473,14 @@ define i16 @subflag16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subw %si, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x29,0xf7]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 %b)
     ret i16 %sub
@@ -330,6 +493,13 @@ define i32 @subflag32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK-NEXT:    subl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x29,0xf7]
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x29,0xf7]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 %b)
     ret i32 %sub
@@ -342,6 +512,13 @@ define i64 @subflag64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK-NEXT:    subq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x29,0xf7]
 ; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x29,0xf7]
+; NF-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 %b)
     ret i64 %sub
@@ -356,6 +533,15 @@ define i8 @subflag8rm(i8 noundef %a, ptr %b) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subb (%rsi), %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x2a,0x3e]
+; NF-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i8, ptr %b
     %sub = call i8 @llvm.usub.sat.i8(i8 %a, i8 %t)
@@ -370,6 +556,14 @@ define i16 @subflag16rm(i16 noundef %a, ptr %b) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subw (%rsi), %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x2b,0x3e]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i16, ptr %b
     %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 %t)
@@ -383,6 +577,13 @@ define i32 @subflag32rm(i32 noundef %a, ptr %b) {
 ; CHECK-NEXT:    subl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x2b,0x3e]
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x2b,0x3e]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 %t)
@@ -396,6 +597,13 @@ define i64 @subflag64rm(i64 noundef %a, ptr %b) {
 ; CHECK-NEXT:    subq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x2b,0x3e]
 ; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x2b,0x3e]
+; NF-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 %t)
@@ -410,6 +618,14 @@ define i16 @subflag16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subw $123, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xef,0x7b]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 123)
     ret i16 %sub
@@ -422,6 +638,13 @@ define i32 @subflag32ri8(i32 noundef %a) {
 ; CHECK-NEXT:    subl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xef,0x7b]
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xef,0x7b]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123)
     ret i32 %sub
@@ -434,6 +657,13 @@ define i64 @subflag64ri8(i64 noundef %a) {
 ; CHECK-NEXT:    subq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xef,0x7b]
 ; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xef,0x7b]
+; NF-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123)
     ret i64 %sub
@@ -448,6 +678,15 @@ define i8 @subflag8ri(i8 noundef %a) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subb $123, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xef,0x7b]
+; NF-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i8 @llvm.usub.sat.i8(i8 %a, i8 123)
     ret i8 %sub
@@ -462,6 +701,15 @@ define i16 @subflag16ri(i16 noundef %a) {
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subw $1234, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xef,0xd2,0x04]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i16 @llvm.usub.sat.i16(i16 %a, i16 1234)
     ret i16 %sub
@@ -475,6 +723,14 @@ define i32 @subflag32ri(i32 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xef,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123456)
     ret i32 %sub
@@ -488,6 +744,14 @@ define i64 @subflag64ri(i64 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: subflag64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NF-NEXT:    subq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xef,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123456)
     ret i64 %sub
@@ -513,6 +777,22 @@ define void @sub64ri_reloc(i64 %val) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:  .LBB41_2: # %f
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64ri_reloc:
+; NF:       # %bb.0:
+; NF-NEXT:    cmpq $val, %rdi # encoding: [0x48,0x81,0xff,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: val, kind: reloc_signed_4byte
+; NF-NEXT:    jbe .LBB41_2 # encoding: [0x76,A]
+; NF-NEXT:    # fixup A - offset: 1, value: .LBB41_2-1, kind: FK_PCRel_1
+; NF-NEXT:  # %bb.1: # %t
+; NF-NEXT:    pushq %rax # encoding: [0x50]
+; NF-NEXT:    .cfi_def_cfa_offset 16
+; NF-NEXT:    callq f@PLT # encoding: [0xe8,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 1, value: f@PLT-4, kind: FK_PCRel_4
+; NF-NEXT:    popq %rax # encoding: [0x58]
+; NF-NEXT:    .cfi_def_cfa_offset 8
+; NF-NEXT:  .LBB41_2: # %f
+; NF-NEXT:    retq # encoding: [0xc3]
   %cmp = icmp ugt i64 %val, ptrtoint (ptr @val to i64)
   br i1 %cmp, label %t, label %f
 
@@ -529,6 +809,11 @@ define void @sub8mr_legacy(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subb %sil, (%rdi) # encoding: [0x40,0x28,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    subb %sil, (%rdi) # encoding: [0x40,0x28,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %sub = sub i8 %t, %b
@@ -541,6 +826,11 @@ define void @sub16mr_legacy(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subw %si, (%rdi) # encoding: [0x66,0x29,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    subw %si, (%rdi) # encoding: [0x66,0x29,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %sub = sub i16 %t, %b
@@ -553,6 +843,11 @@ define void @sub32mr_legacy(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl %esi, (%rdi) # encoding: [0x29,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    subl %esi, (%rdi) # encoding: [0x29,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %sub = sub i32 %t, %b
@@ -565,6 +860,11 @@ define void @sub64mr_legacy(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subq %rsi, (%rdi) # encoding: [0x48,0x29,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    subq %rsi, (%rdi) # encoding: [0x48,0x29,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %sub = sub i64 %t, %b
@@ -577,6 +877,11 @@ define void @sub8mi_legacy(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $-123, (%rdi) # encoding: [0x80,0x07,0x85]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub8mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addb $-123, (%rdi) # encoding: [0x80,0x07,0x85]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %sub = sub nsw i8 %t, 123
@@ -590,6 +895,12 @@ define void @sub16mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addw $-1234, (%rdi) # encoding: [0x66,0x81,0x07,0x2e,0xfb]
 ; CHECK-NEXT:    # imm = 0xFB2E
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub16mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addw $-1234, (%rdi) # encoding: [0x66,0x81,0x07,0x2e,0xfb]
+; NF-NEXT:    # imm = 0xFB2E
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %sub = sub nsw i16 %t, 1234
@@ -603,6 +914,12 @@ define void @sub32mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addl $-123456, (%rdi) # encoding: [0x81,0x07,0xc0,0x1d,0xfe,0xff]
 ; CHECK-NEXT:    # imm = 0xFFFE1DC0
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub32mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addl $-123456, (%rdi) # encoding: [0x81,0x07,0xc0,0x1d,0xfe,0xff]
+; NF-NEXT:    # imm = 0xFFFE1DC0
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %sub = sub nsw i32 %t, 123456
@@ -616,6 +933,12 @@ define void @sub64mi_legacy(ptr %a) {
 ; CHECK-NEXT:    addq $-123456, (%rdi) # encoding: [0x48,0x81,0x07,0xc0,0x1d,0xfe,0xff]
 ; CHECK-NEXT:    # imm = 0xFFFE1DC0
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: sub64mi_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    addq $-123456, (%rdi) # encoding: [0x48,0x81,0x07,0xc0,0x1d,0xfe,0xff]
+; NF-NEXT:    # imm = 0xFFFE1DC0
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %sub = sub nsw i64 %t, 123456
diff --git a/llvm/test/CodeGen/X86/apx/xor.ll b/llvm/test/CodeGen/X86/apx/xor.ll
index d203fbb02782a..436b16b4292df 100644
--- a/llvm/test/CodeGen/X86/apx/xor.ll
+++ b/llvm/test/CodeGen/X86/apx/xor.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
 
 define i8 @xor8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: xor8rr:
@@ -7,6 +8,12 @@ define i8 @xor8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-NEXT:    xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7]
+; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i8 %a, %b
     ret i8 %xor
@@ -18,6 +25,12 @@ define i16 @xor16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-NEXT:    xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i16 %a, %b
     ret i16 %xor
@@ -28,6 +41,11 @@ define i32 @xor32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i32 %a, %b
     ret i32 %xor
@@ -38,6 +56,11 @@ define i64 @xor64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x31,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64rr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq %rsi, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x31,0xf7]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i64 %a, %b
     ret i64 %xor
@@ -48,6 +71,11 @@ define i8 @xor8rm(i8 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x32,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x32,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i8, ptr %b
     %xor = xor i8 %a, %t
@@ -59,6 +87,11 @@ define i16 @xor16rm(i16 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x33,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x33,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i16, ptr %b
     %xor = xor i16 %a, %t
@@ -70,6 +103,11 @@ define i32 @xor32rm(i32 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x33,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x33,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
     %xor = xor i32 %a, %t
@@ -81,6 +119,11 @@ define i64 @xor64rm(i64 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x33,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64rm:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x33,0x3e]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
     %xor = xor i64 %a, %t
@@ -93,6 +136,12 @@ define i16 @xor16ri8(i16 noundef %a) {
 ; CHECK-NEXT:    xorl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xf7,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xf7,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i16 %a, 123
     ret i16 %xor
@@ -103,6 +152,11 @@ define i32 @xor32ri8(i32 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xf7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $123, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0xf7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i32 %a, 123
     ret i32 %xor
@@ -113,6 +167,11 @@ define i64 @xor64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xf7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64ri8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq $123, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xf7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i64 %a, 123
     ret i64 %xor
@@ -123,6 +182,11 @@ define i8 @xor8ri(i8 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xf7,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorb $123, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0xf7,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i8 %a, 123
     ret i8 %xor
@@ -135,6 +199,13 @@ define i16 @xor16ri(i16 noundef %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $1234, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xf7,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i16 %a, 1234
     ret i16 %xor
@@ -146,6 +217,12 @@ define i32 @xor32ri(i32 noundef %a) {
 ; CHECK-NEXT:    xorl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $123456, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i32 %a, 123456
     ret i32 %xor
@@ -157,6 +234,12 @@ define i64 @xor64ri(i64 noundef %a) {
 ; CHECK-NEXT:    xorq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64ri:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq $123456, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i64 %a, 123456
     ret i64 %xor
@@ -167,6 +250,11 @@ define i8 @xor8mr(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x30,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x30,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %xor = xor i8 %t, %b
@@ -178,6 +266,11 @@ define i16 @xor16mr(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %xor = xor i16 %t, %b
@@ -189,6 +282,11 @@ define i32 @xor32mr(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %xor = xor i32 %t, %b
@@ -200,6 +298,11 @@ define i64 @xor64mr(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64mr:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %xor = xor i64 %t, %b
@@ -213,6 +316,13 @@ define i16 @xor16mi8(ptr %a) {
 ; CHECK-NEXT:    xorl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xf0,0x7b]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    xorl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xf0,0x7b]
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %xor = xor i16 %t, 123
@@ -224,6 +334,11 @@ define i32 @xor32mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x37,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x37,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %xor = xor i32 %t, 123
@@ -235,6 +350,11 @@ define i64 @xor64mi8(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x37,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64mi8:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x37,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %xor = xor i64 %t, 123
@@ -246,6 +366,11 @@ define i8 @xor8mi(ptr %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x37,0x7b]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x37,0x7b]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %xor = xor i8 %t, 123
@@ -260,6 +385,14 @@ define i16 @xor16mi(ptr %a) {
 ; CHECK-NEXT:    # imm = 0x4D2
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT:    xorl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x35,0xd2,0x04,0x00,0x00]
+; NF-NEXT:    # imm = 0x4D2
+; NF-NEXT:    # kill: def $ax killed $ax killed $eax
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %xor = xor i16 %t, 1234
@@ -272,6 +405,12 @@ define i32 @xor32mi(ptr %a) {
 ; CHECK-NEXT:    xorl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x37,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x37,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %xor = xor i32 %t, 123456
@@ -284,6 +423,12 @@ define i64 @xor64mi(ptr %a) {
 ; CHECK-NEXT:    xorq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x37,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64mi:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    {nf} xorq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x37,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %xor = xor i64 %t, 123456
@@ -301,6 +446,15 @@ define i1 @xorflag8rr(i8 %a, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag8rr:
+; NF:       # %bb.0:
+; NF-NEXT:    {nf} xorl %edi, %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xfe]
+; NF-NEXT:    xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 %b, -1
   %v0 = xor i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -317,6 +471,15 @@ define i1 @xorflag16rr(i16 %a, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag16rr:
+; NF:       # %bb.0:
+; NF-NEXT:    {nf} xorl %edi, %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xfe]
+; NF-NEXT:    xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 %b, -1
   %v0 = xor i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -332,6 +495,14 @@ define i1 @xorflag32rr(i32 %a, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag32rr:
+; NF:       # %bb.0:
+; NF-NEXT:    xorl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x31,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -346,6 +517,14 @@ define i1 @xorflag64rr(i64 %a, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag64rr:
+; NF:       # %bb.0:
+; NF-NEXT:    xorq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x31,0xf7]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -361,6 +540,15 @@ define i1 @xorflag8rm(ptr %ptr, i8 %b) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag8rm:
+; NF:       # %bb.0:
+; NF-NEXT:    {nf} xorb (%rdi), %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x32,0x37]
+; NF-NEXT:    xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i8, ptr %ptr
   %xor = xor i8 %b, -1
   %v0 = xor i8 %a, %xor  ; 0xff << 50
@@ -378,6 +566,15 @@ define i1 @xorflag16rm(ptr %ptr, i16 %b) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag16rm:
+; NF:       # %bb.0:
+; NF-NEXT:    {nf} xorw (%rdi), %si, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x33,0x37]
+; NF-NEXT:    xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i16, ptr %ptr
   %xor = xor i16 %b, -1
   %v0 = xor i16 %a, %xor  ; 0xff << 50
@@ -394,6 +591,14 @@ define i1 @xorflag32rm(ptr %ptr, i32 %b) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag32rm:
+; NF:       # %bb.0:
+; NF-NEXT:    xorl (%rdi), %esi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x33,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i32, ptr %ptr
   %v0 = xor i32 %a, %b  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
@@ -409,6 +614,14 @@ define i1 @xorflag64rm(ptr %ptr, i64 %b) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag64rm:
+; NF:       # %bb.0:
+; NF-NEXT:    xorq (%rdi), %rsi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x33,0x37]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %a = load i64, ptr %ptr
   %v0 = xor i64 %a, %b  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
@@ -424,6 +637,14 @@ define i1 @xorflag8ri(i8 %a) {
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag8ri:
+; NF:       # %bb.0:
+; NF-NEXT:    xorb $-124, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf7,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i8 123, -1
   %v0 = xor i8 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i8 %v0, 0
@@ -440,6 +661,15 @@ define i1 @xorflag16ri(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag16ri:
+; NF:       # %bb.0:
+; NF-NEXT:    xorw $-1235, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xf7,0x2d,0xfb]
+; NF-NEXT:    # imm = 0xFB2D
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 1234, -1
   %v0 = xor i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -456,6 +686,15 @@ define i1 @xorflag32ri(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag32ri:
+; NF:       # %bb.0:
+; NF-NEXT:    xorl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i32 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -471,6 +710,15 @@ define i1 @xorflag64ri(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag64ri:
+; NF:       # %bb.0:
+; NF-NEXT:    xorq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; NF-NEXT:    # imm = 0x1E240
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i64 %a, 123456  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -485,6 +733,14 @@ define i1 @xorflag16ri8(i16 %a) {
 ; CHECK-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag16ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    xorw $-124, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf7,0x84]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %xor = xor i16 123, -1
   %v0 = xor i16 %a, %xor  ; 0xff << 50
   %v1 = icmp eq i16 %v0, 0
@@ -500,6 +756,14 @@ define i1 @xorflag32ri8(i32 %a) {
 ; CHECK-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag32ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    xorl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xf7,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movl %ecx, d64(%rip) # encoding: [0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i32 %a, 123  ; 0xff << 50
   %v1 = icmp eq i32 %v0, 0
   store i32 %v0, ptr @d64
@@ -514,6 +778,14 @@ define i1 @xorflag64ri8(i64 %a) {
 ; CHECK-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xorflag64ri8:
+; NF:       # %bb.0:
+; NF-NEXT:    xorq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xf7,0x7b]
+; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; NF-NEXT:    movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NF-NEXT:    # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NF-NEXT:    retq # encoding: [0xc3]
   %v0 = xor i64 %a, 123  ; 0xff << 50
   %v1 = icmp eq i64 %v0, 0
   store i64 %v0, ptr @d64
@@ -525,6 +797,11 @@ define void @xor8mr_legacy(ptr %a, i8 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorb %sil, (%rdi) # encoding: [0x40,0x30,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor8mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorb %sil, (%rdi) # encoding: [0x40,0x30,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i8, ptr %a
   %xor = xor i8 %t, %b
@@ -537,6 +814,11 @@ define void @xor16mr_legacy(ptr %a, i16 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorw %si, (%rdi) # encoding: [0x66,0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor16mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorw %si, (%rdi) # encoding: [0x66,0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i16, ptr %a
   %xor = xor i16 %t, %b
@@ -549,6 +831,11 @@ define void @xor32mr_legacy(ptr %a, i32 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %esi, (%rdi) # encoding: [0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor32mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorl %esi, (%rdi) # encoding: [0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i32, ptr %a
   %xor = xor i32 %t, %b
@@ -561,6 +848,11 @@ define void @xor64mr_legacy(ptr %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorq %rsi, (%rdi) # encoding: [0x48,0x31,0x37]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
+;
+; NF-LABEL: xor64mr_legacy:
+; NF:       # %bb.0: # %entry
+; NF-NEXT:    xorq %rsi, (%rdi) # encoding: [0x48,0x31,0x37]
+; NF-NEXT:    retq # encoding: [0xc3]
 entry:
   %t= load i64, ptr %a
   %xor = xor i64 %t, %b
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index 9995e7d3a4d31..d7633cb11e44c 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -787,3 +787,100 @@ define double @load_double_seq_cst(ptr %fptr) {
   %v = load atomic double, ptr %fptr seq_cst, align 8
   ret double %v
 }
+
+define void @store_bfloat(ptr %fptr, bfloat %v) {
+; X86-LABEL: store_bfloat:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    retl
+;
+; X64-SSE-LABEL: store_bfloat:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pextrw $0, %xmm0, %eax
+; X64-SSE-NEXT:    movw %ax, (%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: store_bfloat:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vpextrw $0, %xmm0, %eax
+; X64-AVX-NEXT:    movw %ax, (%rdi)
+; X64-AVX-NEXT:    retq
+  store atomic bfloat %v, ptr %fptr unordered, align 2
+  ret void
+}
+
+; Work around issue #92899 by casting to float
+define float @load_bfloat(ptr %fptr) {
+; X86-SSE1-LABEL: load_bfloat:
+; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movzwl (%eax), %eax
+; X86-SSE1-NEXT:    shll $16, %eax
+; X86-SSE1-NEXT:    movl %eax, (%esp)
+; X86-SSE1-NEXT:    flds (%esp)
+; X86-SSE1-NEXT:    popl %eax
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT:    retl
+;
+; X86-SSE2-LABEL: load_bfloat:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movzwl (%eax), %eax
+; X86-SSE2-NEXT:    shll $16, %eax
+; X86-SSE2-NEXT:    movd %eax, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, (%esp)
+; X86-SSE2-NEXT:    flds (%esp)
+; X86-SSE2-NEXT:    popl %eax
+; X86-SSE2-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: load_bfloat:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    pushl %eax
+; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzwl (%eax), %eax
+; X86-AVX-NEXT:    shll $16, %eax
+; X86-AVX-NEXT:    vmovd %eax, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, (%esp)
+; X86-AVX-NEXT:    flds (%esp)
+; X86-AVX-NEXT:    popl %eax
+; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
+; X86-AVX-NEXT:    retl
+;
+; X86-NOSSE-LABEL: load_bfloat:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movzwl (%eax), %eax
+; X86-NOSSE-NEXT:    shll $16, %eax
+; X86-NOSSE-NEXT:    movl %eax, (%esp)
+; X86-NOSSE-NEXT:    flds (%esp)
+; X86-NOSSE-NEXT:    popl %eax
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; X86-NOSSE-NEXT:    retl
+;
+; X64-SSE-LABEL: load_bfloat:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movzwl (%rdi), %eax
+; X64-SSE-NEXT:    shll $16, %eax
+; X64-SSE-NEXT:    movd %eax, %xmm0
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: load_bfloat:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    movzwl (%rdi), %eax
+; X64-AVX-NEXT:    shll $16, %eax
+; X64-AVX-NEXT:    vmovd %eax, %xmm0
+; X64-AVX-NEXT:    retq
+  %v = load atomic bfloat, ptr %fptr unordered, align 2
+  %ext = fpext bfloat %v to float
+  ret float %ext
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index c981d973fef3e..bad0b411f68a9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512F
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512BW
 
 target triple = "x86_64-unknown-unknown"
 
@@ -14,21 +16,33 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
 }
 
 define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
-; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
-; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; SLOW-NEXT:    vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x float> %shuffle
 }
 
 define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) {
-; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
-; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; SLOW-NEXT:    vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %tmp0 = bitcast <16 x i32> %a to <16 x float>
   %tmp1 = bitcast <16 x i32> %b to <16 x float>
   %shuffle = shufflevector <16 x float> %tmp0, <16 x float> %tmp1, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -196,11 +210,20 @@ define <16 x float> @shuffle_f32_v16f32_00_08_01_09_02_10_03_11_04_12_05_13_06_1
 
 ; PR86076
 define <16 x float> @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08(float %a0, float %a1) {
-; ALL-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; ALL-NEXT:    vbroadcastsd %xmm0, %zmm0
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; SLOW-NEXT:    vbroadcastsd %xmm0, %zmm0
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
+; FAST:       # %bb.0:
+; FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; FAST-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
+; FAST-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; FAST-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; FAST-NEXT:    retq
   %v0 = insertelement <8 x float> poison, float %a0, i64 0
   %v1 = insertelement <8 x float> poison, float %a1, i64 0
   %sv = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
@@ -217,11 +240,17 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
 }
 
 define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
-; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; SLOW-NEXT:    vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   ret <16 x i32> %shuffle
 }
@@ -302,21 +331,33 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08
 
 ; PR46249
 define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x i32> %a) {
-; ALL-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; ALL-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %1 = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
   ret <16 x i32> %1
 }
 
 define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x float> %a) {
-; ALL-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; ALL-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SLOW-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
   ret <16 x float> %1
 }
@@ -333,11 +374,17 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
 }
 
 define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a0, ptr %a1) {
-; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
-; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
+; SLOW-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
+; FAST-NEXT:    vpermt2ps (%rdi), %zmm1, %zmm0
+; FAST-NEXT:    retq
   %1 = load <16 x float>, ptr %a1
   %2 = shufflevector <16 x float> %1, <16 x float> %a0, <16 x i32> <i32 16, i32 19, i32 18, i32 0, i32 20, i32 23, i32 22, i32 4, i32 24, i32 27, i32 26, i32 8, i32 28, i32 31, i32 30, i32 12>
   ret <16 x float> %2
@@ -365,26 +412,41 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
 
 ;FIXME: can do better with vpcompress
 define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
-; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; ALL-NEXT:    retq
+; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; FAST-NEXT:    retq
   %res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   ret <8 x i32> %res
 }
 
 ;FIXME: can do better with vpcompress
 define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
-; ALL-LABEL: test_v16i32_0_1_2_12:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT:    vbroadcastss %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
+; SLOW-LABEL: test_v16i32_0_1_2_12:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; SLOW-NEXT:    vbroadcastss %xmm1, %xmm1
+; SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; SLOW-NEXT:    vzeroupper
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: test_v16i32_0_1_2_12:
+; FAST:       # %bb.0:
+; FAST-NEXT:    vmovaps {{.*#+}} xmm1 = [0,1,2,12]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; FAST-NEXT:    vzeroupper
+; FAST-NEXT:    retq
   %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
   ret <4 x i32> %res
 }
@@ -568,11 +630,18 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12
 }
 
 define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) {
-; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
-; ALL-NEXT:    retq
+; SLOW-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; SLOW:       # %bb.0:
+; SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; SLOW-NEXT:    vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT:    retq
+;
+; FAST-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; FAST:       # %bb.0:
+; FAST-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; FAST-NEXT:    vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; FAST-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   ret <16 x float> %shuffle
 }
diff --git a/llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s b/llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s
new file mode 100644
index 0000000000000..fd04f569526b9
--- /dev/null
+++ b/llvm/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_xsec_branch.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple=arm64-none-linux-gnu -filetype=obj -o %t %s
+# RUN: llvm-rtdyld -triple=arm64-none-linux-gnu -verify -check=%s %t
+
+.globl _main
+.weak _label1
+
+.section .text.label1,"ax"
+_label1:
+        nop
+
+.section .text.main,"ax"
+_main:
+        b _label1
+
+# Branch must be to stub in .text.main, *not* back to _label1, because
+# in general sections could be loaded at arbitrary addresses in target memory,
+# and when initially processing locations and generating stubs we don't know
+# the final layout yet, so we can't tell if the branch offset is within range.
+
+# rtdyld-check: *{4}(_main) = 0x14000001
diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
index 069b71b7229cd..cc10d3400e9b1 100644
--- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-diag-v4.s
@@ -54,7 +54,7 @@
 
 // GCN-LABEL: warning: test_amdhsa_group_segment_fixed_size_repeated
 // AMDHSA: error: .amdhsa_ directives cannot be repeated
-// NONAMDHSA-: error: unknown directive
+// NONAMDHSA: error: unknown directive
 .warning "test_amdhsa_group_segment_fixed_size_repeated"
 .amdhsa_kernel test_amdhsa_group_segment_fixed_size_repeated
   .amdhsa_group_segment_fixed_size 1
diff --git a/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s b/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s
index a083b17aa54fe..e1bb229804209 100644
--- a/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s
+++ b/llvm/test/MC/AsmParser/macro-at-pseudo-variable.s
@@ -74,15 +74,15 @@
 
 #--- b.s
 .rept 2
-  .print "r\+"
+  .print "r\+ \+"
 .endr
 .irpc foo,12
-  .print "\+i"
+  .print "\+\+i"
 .endr
-# CHECK2:      r0
-# CHECK2-NEXT: r1
-# CHECK2-NEXT: 0i
-# CHECK2-NEXT: 1i
+# CHECK2:      r0 0
+# CHECK2-NEXT: r1 1
+# CHECK2-NEXT: 00i
+# CHECK2-NEXT: 11i
 
 .rept 2
   .rept 2
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt
index 7d15f041bd770..78ca1bbdacf29 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt
@@ -91,20 +91,20 @@
 
 # FIXME: Results in invalid v_subrev_u16_dpp which apparently has the same encoding but does not exist in GFX10
 
-# gfx1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-# gfx1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 # 0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00
 
 # FIXME: Results in v_mul_lo_u16_dpp
 
-# gfx1032: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-# gfx1064: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1032: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1064: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 # 0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x00
 
 # FIXME: gives v_lshlrev_b16_dpp
 
-# gfx1032: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-# gfx1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1032: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+# COM: GFX1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc  quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 # 0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00
 
 # GFX1032: v_add_co_u32 v0, s0, v0, v2
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
index 36c58d4c67326..473ede00603a7 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
@@ -1674,7 +1674,7 @@
 # GFX12: ds_pk_add_f16 v0, v0 offset:4660        ; encoding: [0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00]
 0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00
 
-# gfx12: ds_pk_add_bf16 v2, v1                   ; encoding: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00]
+# GFX12: ds_pk_add_bf16 v2, v1                   ; encoding: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00]
 0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00
 
 # GFX12: ds_pk_add_f16 v0, v0 offset:4660        ; encoding: [0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MC/WebAssembly/basic-assembly.s b/llvm/test/MC/WebAssembly/basic-assembly.s
index 769cd7edfa8a3..ac358c1b5c7a5 100644
--- a/llvm/test/MC/WebAssembly/basic-assembly.s
+++ b/llvm/test/MC/WebAssembly/basic-assembly.s
@@ -146,12 +146,14 @@ test0:
 
     .ident      "clang version 9.0.0 (trunk 364502) (llvm/trunk 364571)"
 
-.tabletype empty_eref_table, externref
-empty_eref_table:
+.tabletype empty_externref_table, externref
+empty_externref_table:
 
-.tabletype empty_fref_table, funcref
-empty_fref_table:
+.tabletype empty_funcref_table, funcref
+empty_funcref_table:
 
+.tabletype empty_exnref_table, exnref
+empty_exnref_table:
 
 # CHECK:           .text
 # CHECK:           .globaltype __stack_pointer, i32
@@ -283,8 +285,11 @@ empty_fref_table:
 # CHECK-NEXT:      .p2align    2
 # CHECK-NEXT:      .int32      test0
 
-# CHECK:           .tabletype empty_eref_table, externref
-# CHECK-NEXT: empty_eref_table:
+# CHECK:           .tabletype empty_externref_table, externref
+# CHECK-NEXT: empty_externref_table:
 
-# CHECK:           .tabletype empty_fref_table, funcref
-# CHECK-NEXT: empty_fref_table:
+# CHECK:           .tabletype empty_funcref_table, funcref
+# CHECK-NEXT: empty_funcref_table:
+
+# CHECK:           .tabletype empty_exnref_table, exnref
+# CHECK-NEXT: empty_exnref_table:
diff --git a/llvm/test/MC/WebAssembly/reference-types.s b/llvm/test/MC/WebAssembly/reference-types.s
index ab3e3ee6b155b..2f8bfba68dcea 100644
--- a/llvm/test/MC/WebAssembly/reference-types.s
+++ b/llvm/test/MC/WebAssembly/reference-types.s
@@ -4,22 +4,27 @@
 # CHECK-LABEL:ref_is_null:
 # CHECK: ref.is_null     # encoding: [0xd1]
 ref_is_null:
-  .functype ref_is_null () -> (i32, i32)
+  .functype ref_is_null () -> (i32, i32, i32)
   ref.null_extern
   ref.is_null
   ref.null_func
   ref.is_null
+  ref.null_exn
+  ref.is_null
   end_function
 
 # CHECK-LABEL: ref_null_test:
 # CHECK: ref.null_func   # encoding: [0xd0,0x70]
 # CHECK: ref.null_extern # encoding: [0xd0,0x6f]
+# CHECK: ref.null_exn    # encoding: [0xd0,0x69]
 ref_null_test:
   .functype ref_null_test () -> ()
   ref.null_func
   drop
   ref.null_extern
   drop
+  ref.null_exn
+  drop
   end_function
 
 # CHECK-LABEL: ref_sig_test_funcref:
@@ -36,9 +41,17 @@ ref_sig_test_externref:
   local.get 0
   end_function
 
+# CHECK-LABEL: ref_sig_test_exnref:
+# CHECK-NEXT: .functype ref_sig_test_exnref (exnref) -> (exnref)
+ref_sig_test_exnref:
+  .functype ref_sig_test_exnref (exnref) -> (exnref)
+  local.get 0
+  end_function
+
 # CHECK-LABEL: ref_select_test:
 # CHECK: funcref.select   # encoding: [0x1b]
 # CHECK: externref.select # encoding: [0x1b]
+# CHECK: exnref.select    # encoding: [0x1b]
 ref_select_test:
   .functype ref_select_test () -> ()
   ref.null_func
@@ -51,15 +64,24 @@ ref_select_test:
   i32.const 0
   externref.select
   drop
+  ref.null_exn
+  ref.null_exn
+  i32.const 0
+  exnref.select
+  drop
   end_function
 
 # CHECK-LABEL: ref_block_test:
 # CHECK: block funcref
 # CHECK: block externref
+# CHECK: block exnref
 ref_block_test:
-  .functype ref_block_test () -> (externref, funcref)
+  .functype ref_block_test () -> (exnref, externref, funcref)
   block funcref
   block externref
+  block exnref
+  ref.null_exn
+  end_block
   ref.null_extern
   end_block
   ref.null_func
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index d397188a9882e..113a23da776fa 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -851,4 +851,28 @@ main:
     # CHECK: f16x8.extract_lane 1 # encoding: [0xfd,0xa1,0x02,0x01]
     f16x8.extract_lane 1
 
+    # CHECK: f16x8.add # encoding: [0xfd,0xb4,0x02]
+    f16x8.add
+
+    # CHECK: f16x8.sub # encoding: [0xfd,0xb5,0x02]
+    f16x8.sub
+
+    # CHECK: f16x8.mul # encoding: [0xfd,0xb6,0x02]
+    f16x8.mul
+
+    # CHECK: f16x8.div # encoding: [0xfd,0xb7,0x02]
+    f16x8.div
+
+    # CHECK: f16x8.min # encoding: [0xfd,0xb8,0x02]
+    f16x8.min
+
+    # CHECK: f16x8.max # encoding: [0xfd,0xb9,0x02]
+    f16x8.max
+
+    # CHECK: f16x8.pmin # encoding: [0xfd,0xba,0x02]
+    f16x8.pmin
+
+    # CHECK: f16x8.pmax # encoding: [0xfd,0xbb,0x02]
+    f16x8.pmax
+
     end_function
diff --git a/llvm/test/MC/WebAssembly/type-checker-errors.s b/llvm/test/MC/WebAssembly/type-checker-errors.s
index 5e28d117501e9..d2841250137a8 100644
--- a/llvm/test/MC/WebAssembly/type-checker-errors.s
+++ b/llvm/test/MC/WebAssembly/type-checker-errors.s
@@ -215,6 +215,22 @@ table_fill_type_mismatch_3:
   table.fill valid_table
   end_function
 
+table_fill_type_mismatch_4:
+  .functype table_fill_type_mismatch_4 () -> ()
+  ref.null_exn
+  i32.const 1
+# CHECK: [[@LINE+1]]:3: error: popped exnref, expected externref
+  table.fill valid_table
+  end_function
+
+table_fill_type_mismatch_5:
+  .functype table_fill_type_mismatch_5 () -> ()
+  ref.null_exn
+  i32.const 1
+# CHECK: [[@LINE+1]]:3: error: popped exnref, expected externref
+  table.fill valid_table
+  end_function
+
 table_grow_non_exist_table:
   .functype table_grow_non_exist_table (externref, i32) -> (i32)
   local.get 0
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
index 72109d0cff437..4290e4f705887 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
@@ -1,34 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s
 ; Ports of most of test/CodeGen/NVPTX/access-non-generic.ll
 
 @scalar = internal addrspace(3) global float 0.0, align 4
 @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
 
-; CHECK-LABEL: @load_store_lds_f32(
-; CHECK: %tmp = load float, ptr addrspace(3) @scalar, align 4
-; CHECK: call void @use(float %tmp)
-; CHECK: store float %v, ptr addrspace(3) @scalar, align 4
-; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: %tmp2 = load float, ptr addrspace(3) @scalar, align 4
-; CHECK: call void @use(float %tmp2)
-; CHECK: store float %v, ptr addrspace(3) @scalar, align 4
-; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: %tmp3 = load float, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4
-; CHECK: call void @use(float %tmp3)
-; CHECK: store float %v, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4
-; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: %tmp4 = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 5
-; CHECK: %tmp5 = load float, ptr addrspace(3) %tmp4, align 4
-; CHECK: call void @use(float %tmp5)
-; CHECK: store float %v, ptr addrspace(3) %tmp4, align 4
-; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: %tmp7 = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 %i
-; CHECK: %tmp8 = load float, ptr addrspace(3) %tmp7, align 4
-; CHECK: call void @use(float %tmp8)
-; CHECK: store float %v, ptr addrspace(3) %tmp7, align 4
-; CHECK: call void @llvm.amdgcn.s.barrier()
-; CHECK: ret void
 define amdgpu_kernel void @load_store_lds_f32(i32 %i, float %v) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @load_store_lds_f32(
+; CHECK-SAME: i32 [[I:%.*]], float [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = load float, ptr addrspace(3) @scalar, align 4
+; CHECK-NEXT:    call void @use(float [[TMP]])
+; CHECK-NEXT:    store float [[V]], ptr addrspace(3) @scalar, align 4
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(3) @scalar, align 4
+; CHECK-NEXT:    call void @use(float [[TMP2]])
+; CHECK-NEXT:    store float [[V]], ptr addrspace(3) @scalar, align 4
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4
+; CHECK-NEXT:    call void @use(float [[TMP3]])
+; CHECK-NEXT:    store float [[V]], ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i32 0, i32 5), align 4
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(3) [[TMP4]], align 4
+; CHECK-NEXT:    call void @use(float [[TMP5]])
+; CHECK-NEXT:    store float [[V]], ptr addrspace(3) [[TMP4]], align 4
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [10 x float], ptr addrspace(3) @array, i32 0, i32 [[I]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT:    call void @use(float [[TMP8]])
+; CHECK-NEXT:    store float [[V]], ptr addrspace(3) [[TMP7]], align 4
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    ret void
+;
 bb:
   %tmp = load float, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4
   call void @use(float %tmp)
@@ -57,20 +61,27 @@ bb:
   ret void
 }
 
-; CHECK-LABEL: @constexpr_load_int_from_float_lds(
-; CHECK: %tmp = load i32, ptr addrspace(3) @scalar, align 4
 define i32 @constexpr_load_int_from_float_lds() #0 {
+; CHECK-LABEL: define i32 @constexpr_load_int_from_float_lds(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr addrspace(3) @scalar, align 4
+; CHECK-NEXT:    ret i32 [[TMP]]
+;
 bb:
   %tmp = load i32, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4
   ret i32 %tmp
 }
 
-; CHECK-LABEL: @load_int_from_global_float(
-; CHECK: %tmp1 = getelementptr float, ptr addrspace(1) %input, i32 %i
-; CHECK: %tmp2 = getelementptr float, ptr addrspace(1) %tmp1, i32 %j
-; CHECK: %tmp4 = load i32, ptr addrspace(1) %tmp2
-; CHECK: ret i32 %tmp4
 define i32 @load_int_from_global_float(ptr addrspace(1) %input, i32 %i, i32 %j) #0 {
+; CHECK-LABEL: define i32 @load_int_from_global_float(
+; CHECK-SAME: ptr addrspace(1) [[INPUT:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i32 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP1]], i32 [[J]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
 bb:
   %tmp = addrspacecast ptr addrspace(1) %input to ptr
   %tmp1 = getelementptr float, ptr %tmp, i32 %i
@@ -79,20 +90,26 @@ bb:
   ret i32 %tmp4
 }
 
-; CHECK-LABEL: @nested_const_expr(
-; CHECK: store i32 1, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i64 0, i64 1), align 4
 define amdgpu_kernel void @nested_const_expr() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @nested_const_expr(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    store i32 1, ptr addrspace(3) getelementptr inbounds ([10 x float], ptr addrspace(3) @array, i64 0, i64 1), align 4
+; CHECK-NEXT:    ret void
+;
   store i32 1, ptr bitcast (ptr getelementptr ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i64 0, i64 1) to ptr), align 4
 
   ret void
 }
 
-; CHECK-LABEL: @rauw(
-; CHECK: %addr = getelementptr float, ptr addrspace(1) %input, i64 10
-; CHECK-NEXT: %v = load float, ptr addrspace(1) %addr
-; CHECK-NEXT: store float %v, ptr addrspace(1) %addr
-; CHECK-NEXT: ret void
 define amdgpu_kernel void @rauw(ptr addrspace(1) %input) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @rauw(
+; CHECK-SAME: ptr addrspace(1) [[INPUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr float, ptr addrspace(1) [[INPUT]], i64 10
+; CHECK-NEXT:    [[V:%.*]] = load float, ptr addrspace(1) [[ADDR]], align 4
+; CHECK-NEXT:    store float [[V]], ptr addrspace(1) [[ADDR]], align 4
+; CHECK-NEXT:    ret void
+;
 bb:
   %generic_input = addrspacecast ptr addrspace(1) %input to ptr
   %addr = getelementptr float, ptr %generic_input, i64 10
@@ -102,20 +119,22 @@ bb:
 }
 
 ; FIXME: Should be able to eliminate the cast inside the loop
-; CHECK-LABEL: @loop(
-
-; CHECK: %end = getelementptr float, ptr addrspace(3) @array, i64 10
-; CHECK: br label %loop
-
-; CHECK: loop:                                             ; preds = %loop, %entry
-; CHECK: %i = phi ptr addrspace(3) [ @array, %entry ], [ %i2, %loop ]
-; CHECK: %v = load float, ptr addrspace(3) %i
-; CHECK: call void @use(float %v)
-; CHECK: %i2 = getelementptr float, ptr addrspace(3) %i, i64 1
-; CHECK: %exit_cond = icmp eq ptr addrspace(3) %i2, %end
-
-; CHECK: br i1 %exit_cond, label %exit, label %loop
 define amdgpu_kernel void @loop() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @loop(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[END:%.*]] = getelementptr float, ptr addrspace(3) @array, i64 10
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[I:%.*]] = phi ptr addrspace(3) [ @array, %[[ENTRY]] ], [ [[I2:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[V:%.*]] = load float, ptr addrspace(3) [[I]], align 4
+; CHECK-NEXT:    call void @use(float [[V]])
+; CHECK-NEXT:    [[I2]] = getelementptr float, ptr addrspace(3) [[I]], i64 1
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = icmp eq ptr addrspace(3) [[I2]], [[END]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %p = addrspacecast ptr addrspace(3) @array to ptr
   %end = getelementptr float, ptr %p, i64 10
@@ -135,19 +154,23 @@ exit:                                             ; preds = %loop
 
 @generic_end = external addrspace(1) global ptr
 
-; CHECK-LABEL: @loop_with_generic_bound(
-; CHECK: %end = load ptr, ptr addrspace(1) @generic_end
-; CHECK: br label %loop
-
-; CHECK: loop:
-; CHECK: %i = phi ptr addrspace(3) [ @array, %entry ], [ %i2, %loop ]
-; CHECK: %v = load float, ptr addrspace(3) %i
-; CHECK: call void @use(float %v)
-; CHECK: %i2 = getelementptr float, ptr addrspace(3) %i, i64 1
-; CHECK: %0 = addrspacecast ptr addrspace(3) %i2 to ptr
-; CHECK: %exit_cond = icmp eq ptr %0, %end
-; CHECK: br i1 %exit_cond, label %exit, label %loop
 define amdgpu_kernel void @loop_with_generic_bound() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @loop_with_generic_bound(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[END:%.*]] = load ptr, ptr addrspace(1) @generic_end, align 8
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[I:%.*]] = phi ptr addrspace(3) [ @array, %[[ENTRY]] ], [ [[I2:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[V:%.*]] = load float, ptr addrspace(3) [[I]], align 4
+; CHECK-NEXT:    call void @use(float [[V]])
+; CHECK-NEXT:    [[I2]] = getelementptr float, ptr addrspace(3) [[I]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = icmp eq ptr [[TMP0]], [[END]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   %p = addrspacecast ptr addrspace(3) @array to ptr
   %end = load ptr, ptr addrspace(1) @generic_end
@@ -165,11 +188,14 @@ exit:                                             ; preds = %loop
   ret void
 }
 
-; CHECK-LABEL: @select_bug(
-; CHECK: %sel = select i1 icmp ne (ptr inttoptr (i64 4873 to ptr), ptr null), i64 73, i64 93
-; CHECK: %add.ptr157 = getelementptr inbounds i64, ptr undef, i64 %sel
-; CHECK: %cmp169 = icmp uge ptr undef, %add.ptr157
 define void @select_bug() #0 {
+; CHECK-LABEL: define void @select_bug(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 icmp ne (ptr inttoptr (i64 4873 to ptr), ptr null), i64 73, i64 93
+; CHECK-NEXT:    [[ADD_PTR157:%.*]] = getelementptr inbounds i64, ptr undef, i64 [[SEL]]
+; CHECK-NEXT:    [[CMP169:%.*]] = icmp uge ptr undef, [[ADD_PTR157]]
+; CHECK-NEXT:    unreachable
+;
   %sel = select i1 icmp ne (ptr inttoptr (i64 4873 to ptr), ptr null), i64 73, i64 93
   %add.ptr157 = getelementptr inbounds i64, ptr undef, i64 %sel
   %cmp169 = icmp uge ptr undef, %add.ptr157
diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll
index e6b517a73fa46..23c5f99e5d086 100644
--- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -mtriple=nvptx64-nvidia-cuda -passes=infer-address-spaces %s | FileCheck %s
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
@@ -6,18 +7,23 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 
 @var1 = local_unnamed_addr addrspace(3) externally_initialized global %struct.bar undef, align 8
 
-; CHECK-LABEL: @bug31948(
-; CHECK: %tmp = load ptr, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @var1, i64 0, i32 1), align 8
-; CHECK: %tmp1 = load float, ptr %tmp, align 4
-; CHECK: store float %conv1, ptr %tmp, align 4
-; CHECK: store i32 32, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @var1, i64 0, i32 1), align 4
 define void @bug31948(float %a, ptr nocapture readnone %x, ptr nocapture readnone %y) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @bug31948(
+; CHECK-SAME: float [[A:%.*]], ptr nocapture readnone [[X:%.*]], ptr nocapture readnone [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = load ptr, ptr addrspace(3) getelementptr inbounds ([[STRUCT_BAR:%.*]], ptr addrspace(3) @var1, i64 0, i32 1), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMP]], align 4
+; CHECK-NEXT:    [[CONV1:%.*]] = fadd float [[TMP1]], 1.000000e+00
+; CHECK-NEXT:    store float [[CONV1]], ptr [[TMP]], align 4
+; CHECK-NEXT:    store i32 32, ptr addrspace(3) getelementptr inbounds ([[STRUCT_BAR]], ptr addrspace(3) @var1, i64 0, i32 1), align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %tmp = load ptr, ptr getelementptr (%struct.bar, ptr addrspacecast (ptr addrspace(3) @var1 to ptr), i64 0, i32 1), align 8
   %tmp1 = load float, ptr %tmp, align 4
   %conv1 = fadd float %tmp1, 1.000000e+00
   store float %conv1, ptr %tmp, align 4
-  store i32 32, ptr bitcast (ptr getelementptr (%struct.bar, ptr addrspacecast (ptr addrspace(3) @var1 to ptr), i64 0, i32 1) to ptr), align 4
+  store i32 32, ptr getelementptr (%struct.bar, ptr addrspacecast (ptr addrspace(3) @var1 to ptr), i64 0, i32 1), align 4
   ret void
 }
 
diff --git a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
index c038ffccf3e96..f4cebf1fcb5da 100644
--- a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
+++ b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
@@ -5,10 +5,12 @@
 
 declare ptr @foo()
 declare void @use.ptr(ptr) willreturn nounwind
+declare void @use.val(i8) willreturn nounwind
 declare void @bar()
 declare void @baz()
 declare ptr @llvm.ptrmask.p0.i64(ptr, i64)
 declare i1 @val()
+declare i8 @val8()
 
 define ptr @callee0123() {
 ; CHECK-LABEL: define ptr @callee0123() {
@@ -337,3 +339,74 @@ define ptr @caller12_todo() {
   %r = call nonnull ptr @callee12()
   ret ptr %r
 }
+
+define i8 @callee13() {
+; CHECK-LABEL: define i8 @callee13() {
+; CHECK-NEXT:    [[R:%.*]] = call i8 @val8()
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %r = call i8 @val8()
+  ret i8 %r
+}
+
+define i8 @caller13_okay_use_after_poison_anyways() {
+; CHECK-LABEL: define i8 @caller13_okay_use_after_poison_anyways() {
+; CHECK-NEXT:    [[R_I:%.*]] = call range(i8 0, 10) i8 @val8()
+; CHECK-NEXT:    call void @use.val(i8 [[R_I]])
+; CHECK-NEXT:    ret i8 [[R_I]]
+;
+  %r = call range(i8 0, 10) i8 @callee13()
+  call void @use.val(i8 %r)
+  ret i8 %r
+}
+
+define i8 @callee14() {
+; CHECK-LABEL: define i8 @callee14() {
+; CHECK-NEXT:    [[R:%.*]] = call noundef i8 @val8()
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %r = call noundef i8 @val8()
+  ret i8 %r
+}
+
+define i8 @caller14_fail_creates_ub() {
+; CHECK-LABEL: define i8 @caller14_fail_creates_ub() {
+; CHECK-NEXT:    [[R_I:%.*]] = call noundef i8 @val8()
+; CHECK-NEXT:    call void @use.val(i8 [[R_I]])
+; CHECK-NEXT:    ret i8 [[R_I]]
+;
+  %r = call range(i8 0, 10) i8 @callee14()
+  call void @use.val(i8 %r)
+  ret i8 %r
+}
+
+define i8 @caller14_okay_is_ub_anyways() {
+; CHECK-LABEL: define i8 @caller14_okay_is_ub_anyways() {
+; CHECK-NEXT:    [[R_I:%.*]] = call noundef range(i8 0, 10) i8 @val8()
+; CHECK-NEXT:    call void @use.val(i8 [[R_I]])
+; CHECK-NEXT:    ret i8 [[R_I]]
+;
+  %r = call noundef range(i8 0, 10) i8 @callee14()
+  call void @use.val(i8 %r)
+  ret i8 %r
+}
+
+define i8 @callee15() {
+; CHECK-LABEL: define i8 @callee15() {
+; CHECK-NEXT:    [[R:%.*]] = call range(i8 5, 10) i8 @val8()
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %r = call range(i8 5, 10) i8 @val8()
+  ret i8 %r
+}
+
+define i8 @caller15_okay_intersect_ranges() {
+; CHECK-LABEL: define i8 @caller15_okay_intersect_ranges() {
+; CHECK-NEXT:    [[R_I:%.*]] = call range(i8 5, 7) i8 @val8()
+; CHECK-NEXT:    call void @use.val(i8 [[R_I]])
+; CHECK-NEXT:    ret i8 [[R_I]]
+;
+  %r = call range(i8 0, 7) i8 @callee15()
+  call void @use.val(i8 %r)
+  ret i8 %r
+}
diff --git a/llvm/test/Transforms/InstCombine/ashr-lshr.ll b/llvm/test/Transforms/InstCombine/ashr-lshr.ll
index ac206dc7999dd..c2a4f35412670 100644
--- a/llvm/test/Transforms/InstCombine/ashr-lshr.ll
+++ b/llvm/test/Transforms/InstCombine/ashr-lshr.ll
@@ -604,3 +604,262 @@ define <2 x i8> @ashr_known_pos_exact_vec(<2 x i8> %x, <2 x i8> %y) {
   %r = ashr exact <2 x i8> %p, %y
   ret <2 x i8> %r
 }
+
+define i32 @lshr_mul_times_3_div_2(i32 %0) {
+; CHECK-LABEL: @lshr_mul_times_3_div_2(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 1
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nsw nuw i32 %0, 3
+  %lshr = lshr i32 %mul, 1
+  ret i32 %lshr
+}
+
+define i32 @lshr_mul_times_3_div_2_exact(i32 %x) {
+; CHECK-LABEL: @lshr_mul_times_3_div_2_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[LSHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nsw i32 %x, 3
+  %lshr = lshr exact i32 %mul, 1
+  ret i32 %lshr
+}
+
+; Negative test
+
+define i32 @lshr_mul_times_3_div_2_no_flags(i32 %0) {
+; CHECK-LABEL: @lshr_mul_times_3_div_2_no_flags(
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 3
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[MUL]], 1
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul i32 %0, 3
+  %lshr = lshr i32 %mul, 1
+  ret i32 %lshr
+}
+
+; Negative test
+
+define i32 @mul_times_3_div_2_multiuse_lshr(i32 %x) {
+; CHECK-LABEL: @mul_times_3_div_2_multiuse_lshr(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[RES:%.*]] = lshr i32 [[MUL]], 1
+; CHECK-NEXT:    call void @use(i32 [[MUL]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %mul = mul nuw i32 %x, 3
+  %res = lshr i32 %mul, 1
+  call void @use(i32 %mul)
+  ret i32 %res
+}
+
+define i32 @lshr_mul_times_3_div_2_exact_2(i32 %x) {
+; CHECK-LABEL: @lshr_mul_times_3_div_2_exact_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nuw i32 %x, 3
+  %lshr = lshr exact i32 %mul, 1
+  ret i32 %lshr
+}
+
+define i32 @lshr_mul_times_5_div_4(i32 %0) {
+; CHECK-LABEL: @lshr_mul_times_5_div_4(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 2
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nsw nuw i32 %0, 5
+  %lshr = lshr i32 %mul, 2
+  ret i32 %lshr
+}
+
+define i32 @lshr_mul_times_5_div_4_exact(i32 %x) {
+; CHECK-LABEL: @lshr_mul_times_5_div_4_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[LSHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nsw i32 %x, 5
+  %lshr = lshr exact i32 %mul, 2
+  ret i32 %lshr
+}
+
+; Negative test
+
+define i32 @lshr_mul_times_5_div_4_no_flags(i32 %0) {
+; CHECK-LABEL: @lshr_mul_times_5_div_4_no_flags(
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 5
+; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[MUL]], 2
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul i32 %0, 5
+  %lshr = lshr i32 %mul, 2
+  ret i32 %lshr
+}
+
+; Negative test
+
+define i32 @mul_times_5_div_4_multiuse_lshr(i32 %x) {
+; CHECK-LABEL: @mul_times_5_div_4_multiuse_lshr(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i32 [[X:%.*]], 5
+; CHECK-NEXT:    [[RES:%.*]] = lshr i32 [[MUL]], 2
+; CHECK-NEXT:    call void @use(i32 [[MUL]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %mul = mul nuw i32 %x, 5
+  %res = lshr i32 %mul, 2
+  call void @use(i32 %mul)
+  ret i32 %res
+}
+
+define i32 @lshr_mul_times_5_div_4_exact_2(i32 %x) {
+; CHECK-LABEL: @lshr_mul_times_5_div_4_exact_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[LSHR:%.*]] = add nuw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[LSHR]]
+;
+  %mul = mul nuw i32 %x, 5
+  %lshr = lshr exact i32 %mul, 2
+  ret i32 %lshr
+}
+
+define i32 @ashr_mul_times_3_div_2(i32 %0) {
+; CHECK-LABEL: @ashr_mul_times_3_div_2(
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[TMP0:%.*]], 1
+; CHECK-NEXT:    [[ASHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nuw nsw i32 %0, 3
+  %ashr = ashr i32 %mul, 1
+  ret i32 %ashr
+}
+
+define i32 @ashr_mul_times_3_div_2_exact(i32 %x) {
+; CHECK-LABEL: @ashr_mul_times_3_div_2_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nsw i32 %x, 3
+  %ashr = ashr exact i32 %mul, 1
+  ret i32 %ashr
+}
+
+; Negative test
+
+define i32 @ashr_mul_times_3_div_2_no_flags(i32 %0) {
+; CHECK-LABEL: @ashr_mul_times_3_div_2_no_flags(
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 3
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i32 [[MUL]], 1
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul i32 %0, 3
+  %ashr = ashr i32 %mul, 1
+  ret i32 %ashr
+}
+
+; Negative test
+
+define i32 @ashr_mul_times_3_div_2_no_nsw(i32 %0) {
+; CHECK-LABEL: @ashr_mul_times_3_div_2_no_nsw(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i32 [[TMP0:%.*]], 3
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i32 [[MUL]], 1
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nuw i32 %0, 3
+  %ashr = ashr i32 %mul, 1
+  ret i32 %ashr
+}
+
+; Negative test
+
+define i32 @mul_times_3_div_2_multiuse_ashr(i32 %x) {
+; CHECK-LABEL: @mul_times_3_div_2_multiuse_ashr(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[RES:%.*]] = ashr i32 [[MUL]], 1
+; CHECK-NEXT:    call void @use(i32 [[MUL]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %mul = mul nsw i32 %x, 3
+  %res = ashr i32 %mul, 1
+  call void @use(i32 %mul)
+  ret i32 %res
+}
+
+define i32 @ashr_mul_times_3_div_2_exact_2(i32 %x) {
+; CHECK-LABEL: @ashr_mul_times_3_div_2_exact_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nsw i32 %x, 3
+  %ashr = ashr exact i32 %mul, 1
+  ret i32 %ashr
+}
+
+define i32 @ashr_mul_times_5_div_4(i32 %0) {
+; CHECK-LABEL: @ashr_mul_times_5_div_4(
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[TMP0:%.*]], 2
+; CHECK-NEXT:    [[ASHR:%.*]] = add nuw nsw i32 [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nuw nsw i32 %0, 5
+  %ashr = ashr i32 %mul, 2
+  ret i32 %ashr
+}
+
+define i32 @ashr_mul_times_5_div_4_exact(i32 %x) {
+; CHECK-LABEL: @ashr_mul_times_5_div_4_exact(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nsw i32 %x, 5
+  %ashr = ashr exact i32 %mul, 2
+  ret i32 %ashr
+}
+
+; Negative test
+
+define i32 @ashr_mul_times_5_div_4_no_flags(i32 %0) {
+; CHECK-LABEL: @ashr_mul_times_5_div_4_no_flags(
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP0:%.*]], 5
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i32 [[MUL]], 2
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul i32 %0, 5
+  %ashr = ashr i32 %mul, 2
+  ret i32 %ashr
+}
+
+; Negative test
+
+define i32 @mul_times_5_div_4_multiuse_ashr(i32 %x) {
+; CHECK-LABEL: @mul_times_5_div_4_multiuse_ashr(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[X:%.*]], 5
+; CHECK-NEXT:    [[RES:%.*]] = ashr i32 [[MUL]], 2
+; CHECK-NEXT:    call void @use(i32 [[MUL]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %mul = mul nsw i32 %x, 5
+  %res = ashr i32 %mul, 2
+  call void @use(i32 %mul)
+  ret i32 %res
+}
+
+define i32 @ashr_mul_times_5_div_4_exact_2(i32 %x) {
+; CHECK-LABEL: @ashr_mul_times_5_div_4_exact_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr exact i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[ASHR:%.*]] = add nsw i32 [[TMP1]], [[X]]
+; CHECK-NEXT:    ret i32 [[ASHR]]
+;
+  %mul = mul nsw i32 %x, 5
+  %ashr = ashr exact i32 %mul, 2
+  ret i32 %ashr
+}
+
+declare void @use(i32)
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll
index 88487b38e2c70..0a7de501ca022 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll
@@ -144,7 +144,7 @@ define i1 @oneuse0(i8 %x, i8 %y) {
 ; CHECK-LABEL: @oneuse0(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[X_HIGHBITS:%.*]] = lshr i8 [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[X_HIGHBITS:%.*]] = and i8 [[T0]], [[X:%.*]]
 ; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[X_HIGHBITS]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
@@ -161,7 +161,8 @@ define i1 @oneuse1(i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
 ; CHECK-NEXT:    call void @use8(i8 [[T1]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
@@ -195,7 +196,8 @@ define i1 @oneuse3(i8 %x, i8 %y) {
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
 ; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
 ; CHECK-NEXT:    call void @use8(i8 [[T1]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp uge i8 [[T1]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
@@ -269,9 +271,8 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) {
 define i1 @n1(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n1(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
-; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 1, %y ; not -1
@@ -284,9 +285,9 @@ define i1 @n1(i8 %x, i8 %y) {
 define i1 @n2(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n2(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], 1
+; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -2
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll
index b717925fd644f..54ff87676e71d 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll
@@ -144,7 +144,7 @@ define i1 @oneuse0(i8 %x, i8 %y) {
 ; CHECK-LABEL: @oneuse0(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[X_HIGHBITS:%.*]] = lshr i8 [[X:%.*]], [[Y]]
+; CHECK-NEXT:    [[X_HIGHBITS:%.*]] = and i8 [[T0]], [[X:%.*]]
 ; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[X_HIGHBITS]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
@@ -161,7 +161,8 @@ define i1 @oneuse1(i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
 ; CHECK-NEXT:    call void @use8(i8 [[T1]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
@@ -195,7 +196,8 @@ define i1 @oneuse3(i8 %x, i8 %y) {
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
 ; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
 ; CHECK-NEXT:    call void @use8(i8 [[T1]])
-; CHECK-NEXT:    [[RET:%.*]] = icmp ult i8 [[T1]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
@@ -269,9 +271,8 @@ define i1 @n0(i8 %x, i8 %y, i8 %notx) {
 define i1 @n1(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n1(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -1
-; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[T0]], [[X:%.*]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 1, %y ; not -1
@@ -284,9 +285,9 @@ define i1 @n1(i8 %x, i8 %y) {
 define i1 @n2(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n2(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
-; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], 1
+; CHECK-NEXT:    [[T1:%.*]] = xor i8 [[T0]], -2
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll
index a65be1e9ceeca..c7c57b601eab3 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-eq-to-icmp-ule.ll
@@ -251,9 +251,9 @@ define i1 @n1(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n1(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0]], -1
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 0, [[T0]]
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y ; not 1
@@ -268,9 +268,9 @@ define i1 @n2(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n2(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[T1:%.*]] = add nuw i8 [[T0]], 1
+; CHECK-NEXT:    [[T1:%.*]] = sub nuw i8 -2, [[T0]]
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp eq i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 1, %y
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll
index f156d9bf007cb..d5826524f1637 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v3-and-icmp-ne-to-icmp-ugt.ll
@@ -251,9 +251,9 @@ define i1 @n1(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n1(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[T1:%.*]] = add i8 [[T0]], -1
+; CHECK-NEXT:    [[T1:%.*]] = sub i8 0, [[T0]]
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 -1, %y ; not 1
@@ -268,9 +268,9 @@ define i1 @n2(i8 %x, i8 %y) {
 ; CHECK-LABEL: @n2(
 ; CHECK-NEXT:    [[T0:%.*]] = shl nuw i8 1, [[Y:%.*]]
 ; CHECK-NEXT:    call void @use8(i8 [[T0]])
-; CHECK-NEXT:    [[T1:%.*]] = add nuw i8 [[T0]], 1
+; CHECK-NEXT:    [[T1:%.*]] = sub nuw i8 -2, [[T0]]
 ; CHECK-NEXT:    [[T2:%.*]] = and i8 [[T1]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne i8 [[T2]], 0
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %t0 = shl i8 1, %y
diff --git a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
index 5de3e89d7027a..8bb7fd0e522cb 100644
--- a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
@@ -5,9 +5,9 @@ declare void @use.i8(i8)
 declare void @use.i16(i16)
 define i1 @src_is_mask_zext(i16 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_zext(
-; CHECK-NEXT:    [[X:%.*]] = xor i16 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[M_IN:%.*]] = lshr i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = zext i8 [[M_IN]] to i16
+; CHECK-NEXT:    [[X:%.*]] = xor i16 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i16 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -22,11 +22,11 @@ define i1 @src_is_mask_zext(i16 %x_in, i8 %y) {
 
 define i1 @src_is_mask_zext_fail_not_mask(i16 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_zext_fail_not_mask(
-; CHECK-NEXT:    [[X:%.*]] = xor i16 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[M_IN:%.*]] = lshr i8 -2, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = zext i8 [[M_IN]] to i16
-; CHECK-NEXT:    [[AND:%.*]] = and i16 [[X]], [[MASK]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i16 [[AND]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = or i16 [[TMP1]], [[MASK]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i16 [[TMP2]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i16 %x_in, 123
@@ -80,10 +80,10 @@ define i1 @src_is_mask_sext_fail_multiuse(i16 %x_in, i8 %y) {
 
 define i1 @src_is_mask_and(i8 %x_in, i8 %y, i8 %z) {
 ; CHECK-LABEL: @src_is_mask_and(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[MY:%.*]] = lshr i8 7, [[Y:%.*]]
 ; CHECK-NEXT:    [[MZ:%.*]] = lshr i8 -1, [[Z:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = and i8 [[MY]], [[MZ]]
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -99,12 +99,12 @@ define i1 @src_is_mask_and(i8 %x_in, i8 %y, i8 %z) {
 
 define i1 @src_is_mask_and_fail_mixed(i8 %x_in, i8 %y, i8 %z) {
 ; CHECK-LABEL: @src_is_mask_and_fail_mixed(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[MY:%.*]] = ashr i8 -8, [[Y:%.*]]
 ; CHECK-NEXT:    [[MZ:%.*]] = lshr i8 -1, [[Z:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = and i8 [[MY]], [[MZ]]
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], [[MASK]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[X]], [[AND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[MASK]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[TMP2]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
@@ -119,9 +119,9 @@ define i1 @src_is_mask_and_fail_mixed(i8 %x_in, i8 %y, i8 %z) {
 
 define i1 @src_is_mask_or(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_or(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[MY:%.*]] = lshr i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = and i8 [[MY]], 7
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -136,9 +136,9 @@ define i1 @src_is_mask_or(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_xor(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_xor(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[MASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -152,11 +152,11 @@ define i1 @src_is_mask_xor(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_xor_fail_notmask(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_xor_fail_notmask(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i8 0, [[Y:%.*]]
 ; CHECK-NEXT:    [[NOTMASK:%.*]] = xor i8 [[TMP1]], [[Y]]
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], [[NOTMASK]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i8 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[NOTMASK]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP3]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
@@ -170,10 +170,10 @@ define i1 @src_is_mask_xor_fail_notmask(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_select(i8 %x_in, i8 %y, i1 %cond) {
 ; CHECK-LABEL: @src_is_mask_select(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -245,11 +245,11 @@ define i1 @src_is_mask_shl_lshr_fail_not_allones(i8 %x_in, i8 %y, i1 %cond) {
 
 define i1 @src_is_mask_lshr(i8 %x_in, i8 %y, i8 %z, i1 %cond) {
 ; CHECK-LABEL: @src_is_mask_lshr(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[SMASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15
 ; CHECK-NEXT:    [[MASK:%.*]] = lshr i8 [[SMASK]], [[Z:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -265,11 +265,11 @@ define i1 @src_is_mask_lshr(i8 %x_in, i8 %y, i8 %z, i1 %cond) {
 
 define i1 @src_is_mask_ashr(i8 %x_in, i8 %y, i8 %z, i1 %cond) {
 ; CHECK-LABEL: @src_is_mask_ashr(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[SMASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15
 ; CHECK-NEXT:    [[MASK:%.*]] = ashr i8 [[SMASK]], [[Z:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -285,9 +285,9 @@ define i1 @src_is_mask_ashr(i8 %x_in, i8 %y, i8 %z, i1 %cond) {
 
 define i1 @src_is_mask_p2_m1(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_p2_m1(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[P2ORZ:%.*]] = shl i8 2, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = add i8 [[P2ORZ]], -1
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -301,10 +301,10 @@ define i1 @src_is_mask_p2_m1(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_umax(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_umax(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.umax.i8(i8 [[YMASK]], i8 3)
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -320,11 +320,11 @@ define i1 @src_is_mask_umax(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_umin(i8 %x_in, i8 %y, i8 %z) {
 ; CHECK-LABEL: @src_is_mask_umin(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[ZMASK:%.*]] = lshr i8 15, [[Z:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.umin.i8(i8 [[YMASK]], i8 [[ZMASK]])
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -341,12 +341,12 @@ define i1 @src_is_mask_umin(i8 %x_in, i8 %y, i8 %z) {
 
 define i1 @src_is_mask_umin_fail_mismatch(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_umin_fail_mismatch(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.umin.i8(i8 [[YMASK]], i8 -32)
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[MASK]], [[X]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[MASK]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP2]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
@@ -361,10 +361,10 @@ define i1 @src_is_mask_umin_fail_mismatch(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_smax(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_smax(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.smax.i8(i8 [[YMASK]], i8 -1)
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -380,10 +380,10 @@ define i1 @src_is_mask_smax(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_smin(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_smin(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.smin.i8(i8 [[YMASK]], i8 0)
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -399,9 +399,9 @@ define i1 @src_is_mask_smin(i8 %x_in, i8 %y) {
 
 define i1 @src_is_mask_bitreverse_not_mask(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_mask_bitreverse_not_mask(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[NMASK:%.*]] = shl nsw i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[NMASK]])
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[MASK]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -417,7 +417,7 @@ define i1 @src_is_mask_bitreverse_not_mask(i8 %x_in, i8 %y) {
 define i1 @src_is_notmask_sext(i16 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_notmask_sext(
 ; CHECK-NEXT:    [[M_IN:%.*]] = shl i8 -8, [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[X_IN:%.*]], -128
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[M_IN]] to i16
 ; CHECK-NEXT:    [[R:%.*]] = icmp uge i16 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret i1 [[R]]
@@ -529,12 +529,11 @@ define i1 @src_is_notmask_lshr_shl(i8 %x_in, i8 %y) {
 
 define i1 @src_is_notmask_lshr_shl_fail_mismatch_shifts(i8 %x_in, i8 %y, i8 %z) {
 ; CHECK-LABEL: @src_is_notmask_lshr_shl_fail_mismatch_shifts(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[MASK_SHR:%.*]] = lshr i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[NMASK:%.*]] = shl i8 [[MASK_SHR]], [[Z:%.*]]
-; CHECK-NEXT:    [[MASK:%.*]] = xor i8 [[NMASK]], -1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], [[MASK]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123
+; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], [[NMASK]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[TMP2]], 0
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
diff --git a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll
index e95c72b75f97d..0f26be12c39cc 100644
--- a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll
@@ -3,6 +3,7 @@
 
 declare i1 @barrier()
 declare void @llvm.assume(i1)
+declare void @use.i8(i8)
 
 define i1 @icmp_ult_x_y(i8 %x, i8 %y) {
 ; CHECK-LABEL: @icmp_ult_x_y(
@@ -238,9 +239,9 @@ define i1 @icmp_sle_negx_y_fail_maybe_zero(i8 %x, i8 %y) {
 
 define i1 @icmp_eq_x_invertable_y_todo(i8 %x, i1 %y) {
 ; CHECK-LABEL: @icmp_eq_x_invertable_y_todo(
-; CHECK-NEXT:    [[YY:%.*]] = select i1 [[Y:%.*]], i8 7, i8 24
+; CHECK-NEXT:    [[YY:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 -25
 ; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY]], [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], 0
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %yy = select i1 %y, i8 7, i8 24
@@ -251,22 +252,36 @@ define i1 @icmp_eq_x_invertable_y_todo(i8 %x, i1 %y) {
 
 define i1 @icmp_eq_x_invertable_y(i8 %x, i8 %y) {
 ; CHECK-LABEL: @icmp_eq_x_invertable_y(
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %yy = xor i8 %y, -1
+  %and = and i8 %x, %yy
+  %r = icmp eq i8 %x, %and
+  ret i1 %r
+}
+
+define i1 @icmp_eq_x_invertable_y_fail_multiuse(i8 %x, i8 %y) {
+; CHECK-LABEL: @icmp_eq_x_invertable_y_fail_multiuse(
 ; CHECK-NEXT:    [[YY:%.*]] = xor i8 [[Y:%.*]], -1
 ; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY]], [[X:%.*]]
+; CHECK-NEXT:    call void @use.i8(i8 [[AND]])
 ; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], [[X]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %yy = xor i8 %y, -1
   %and = and i8 %x, %yy
+  call void @use.i8(i8 %and)
   %r = icmp eq i8 %x, %and
   ret i1 %r
 }
 
 define i1 @icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y) {
 ; CHECK-LABEL: @icmp_eq_x_invertable_y2_todo(
-; CHECK-NEXT:    [[YY:%.*]] = select i1 [[Y:%.*]], i8 7, i8 24
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY]], [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[YY]], [[AND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 -25
+; CHECK-NEXT:    [[TMP2:%.*]] = or i8 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[TMP2]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %yy = select i1 %y, i8 7, i8 24
@@ -277,9 +292,8 @@ define i1 @icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y) {
 
 define i1 @icmp_eq_x_invertable_y2(i8 %x, i8 %y) {
 ; CHECK-LABEL: @icmp_eq_x_invertable_y2(
-; CHECK-NEXT:    [[YY:%.*]] = xor i8 [[Y:%.*]], -1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[YY]], [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[AND]], [[YY]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[TMP1]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %yy = xor i8 %y, -1
diff --git a/llvm/test/Transforms/InstCombine/lshr.ll b/llvm/test/Transforms/InstCombine/lshr.ll
index fa92c1c4b3be4..dfdb6c7b4b268 100644
--- a/llvm/test/Transforms/InstCombine/lshr.ll
+++ b/llvm/test/Transforms/InstCombine/lshr.ll
@@ -628,12 +628,12 @@ define i32 @mul_splat_fold_wrong_lshr_const(i32 %x) {
   ret i32 %t
 }
 
-; Negative test
+; Negative test (but simplifies into a different transform)
 
 define i32 @mul_splat_fold_no_nuw(i32 %x) {
 ; CHECK-LABEL: @mul_splat_fold_no_nuw(
-; CHECK-NEXT:    [[M:%.*]] = mul nsw i32 [[X:%.*]], 65537
-; CHECK-NEXT:    [[T:%.*]] = lshr i32 [[M]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 16
+; CHECK-NEXT:    [[T:%.*]] = add nsw i32 [[TMP1]], [[X]]
 ; CHECK-NEXT:    ret i32 [[T]]
 ;
   %m = mul nsw i32 %x, 65537
@@ -641,6 +641,19 @@ define i32 @mul_splat_fold_no_nuw(i32 %x) {
   ret i32 %t
 }
 
+; Negative test (mul has neither nuw nor nsw, so no fold applies)
+
+define i32 @mul_splat_fold_no_flags(i32 %x) {
+; CHECK-LABEL: @mul_splat_fold_no_flags(
+; CHECK-NEXT:    [[M:%.*]] = mul i32 [[X:%.*]], 65537
+; CHECK-NEXT:    [[T:%.*]] = lshr i32 [[M]], 16
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %m = mul i32 %x, 65537
+  %t = lshr i32 %m, 16
+  ret i32 %t
+}
+
 ; Negative test (but simplifies before we reach the mul_splat transform)- need more than 2 bits
 
 define i2 @mul_splat_fold_too_narrow(i2 %x) {
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll b/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll
new file mode 100644
index 0000000000000..da56997f69382
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/logf128.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+
+; REQUIRES: has_logf128
+declare fp128 @llvm.log.f128(fp128)
+
+define fp128 @log_e_64(){
+; CHECK-LABEL: define fp128 @log_e_64() {
+; CHECK-NEXT:    ret fp128 0xL300000000000000040010A2B23F3BAB7
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000004005000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_smallest_positive_subnormal_number(){
+; CHECK-LABEL: define fp128 @log_e_smallest_positive_subnormal_number() {
+; CHECK-NEXT:    ret fp128 0xL3000000000000000C00C654628220780
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000010000000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_largest_subnormal_number(){
+; CHECK-LABEL: define fp128 @log_e_largest_subnormal_number() {
+; CHECK-NEXT:    ret fp128 0xLD000000000000000C00C62D918CE2421
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xLFFFFFFFFFFFFFFFF0000FFFFFFFFFFFF)
+  ret fp128 %A
+}
+
+define fp128 @log_e_smallest_positive_normal_number(){
+;
+; CHECK-LABEL: define fp128 @log_e_smallest_positive_normal_number() {
+; CHECK-NEXT:    ret fp128 0xLD000000000000000C00C62D918CE2421
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000000001000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_largest_normal_number(){
+; CHECK-LABEL: define fp128 @log_e_largest_normal_number() {
+; CHECK-NEXT:    ret fp128 0xLF000000000000000400C62E42FEFA39E
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xLFFFFFFFFFFFFFFFF7FFEFFFFFFFFFFFF)
+  ret fp128 %A
+}
+
+define fp128 @log_e_largest_number_less_than_one(){
+; CHECK-LABEL: define fp128 @log_e_largest_number_less_than_one() {
+; CHECK-NEXT:    ret fp128 0xL0000000000000000BF8E000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xLFFFFFFFFFFFFFFFF3FFEFFFFFFFFFFFF)
+  ret fp128 %A
+}
+
+define fp128 @log_e_1(){
+; CHECK-LABEL: define fp128 @log_e_1() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000000000000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000003FFF000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_smallest_number_larger_than_one(){
+; CHECK-LABEL: define fp128 @log_e_smallest_number_larger_than_one() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000003F8F000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000013FFF000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_negative_2(){
+; CHECK-LABEL: define fp128 @log_e_negative_2() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000007FFF800000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL0000000000000000C000000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_0(){
+; CHECK-LABEL: define fp128 @log_e_0() {
+; CHECK-NEXT:    ret fp128 0xL0000000000000000FFFF000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000000000000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_negative_0(){
+; CHECK-LABEL: define fp128 @log_e_negative_0() {
+; CHECK-NEXT:    ret fp128 0xL0000000000000000FFFF000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000008000000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_infinity(){
+; CHECK-LABEL: define fp128 @log_e_infinity() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000007FFF000000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000007FFF000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_negative_infinity(){
+; CHECK-LABEL: define fp128 @log_e_negative_infinity() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000007FFF800000000000
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL0000000000000000FFFF000000000000)
+  ret fp128 %A
+}
+
+define fp128 @log_e_nan(){
+; CHECK-LABEL: define fp128 @log_e_nan() {
+; CHECK-NEXT:    ret fp128 0xL00000000000000007FFF800000000001
+;
+  %A = call fp128 @llvm.log.f128(fp128 noundef 0xL00000000000000007FFF000000000001)
+  ret fp128 %A
+}
+
+define <2 x fp128> @log_e_negative_2_vector(){
+; CHECK-LABEL: define <2 x fp128> @log_e_negative_2_vector() {
+; CHECK-NEXT:    ret <2 x fp128> <fp128 0xL00000000000000007FFF800000000000, fp128 0xL00000000000000007FFF800000000000>
+;
+  %A = call <2 x fp128> @llvm.log.v2f128(<2 x fp128> <fp128 0xL0000000000000000C000000000000000, fp128 0xL0000000000000000C000000000000001>)
+  ret <2 x fp128> %A
+}
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll
index 5f55450120620..00ee7f8a92b21 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vectorgep-crash.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes=instcombine -S -o - %s | FileCheck %s
 ; Tests that we don't crash upon encountering a vector GEP
 
@@ -23,17 +24,21 @@ top:
 %struct.C = type { i64 }
 
 @G = internal global [65 x %struct.A] zeroinitializer, align 16
-; CHECK-LABEL: @test
-; CHECK: ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>, i32 0)
 define <16 x ptr> @test() {
+; CHECK-LABEL: define <16 x ptr> @test() {
+; CHECK-NEXT:  [[VECTOR_BODY:.*:]]
+; CHECK-NEXT:    ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>, i32 0)
+;
 vector.body:
   %VectorGep = getelementptr [65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>, <16 x i32> zeroinitializer
   ret <16 x ptr> %VectorGep
 }
 
-; CHECK-LABEL: @test2
-; CHECK: ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9,
 define <16 x ptr> @test2() {
+; CHECK-LABEL: define <16 x ptr> @test2() {
+; CHECK-NEXT:  [[VECTOR_BODY:.*:]]
+; CHECK-NEXT:    ret <16 x ptr> getelementptr ([65 x %struct.A], ptr @G, <16 x i64> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>, i32 0)
+;
 vector.body:
   %VectorGep = getelementptr [65 x %struct.A], ptr @G, <16 x i32> zeroinitializer, <16 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>, <16 x i32> zeroinitializer
   ret <16 x ptr> %VectorGep
@@ -42,7 +47,7 @@ vector.body:
 @g = external global i8, align 1
 
 define <2 x ptr> @constant_zero_index() {
-; CHECK-LABEL: @constant_zero_index(
+; CHECK-LABEL: define <2 x ptr> @constant_zero_index() {
 ; CHECK-NEXT:    ret <2 x ptr> <ptr @g, ptr @g>
 ;
   %gep = getelementptr i8, ptr @g, <2 x i64> zeroinitializer
@@ -50,7 +55,7 @@ define <2 x ptr> @constant_zero_index() {
 }
 
 define <2 x ptr> @constant_undef_index() {
-; CHECK-LABEL: @constant_undef_index(
+; CHECK-LABEL: define <2 x ptr> @constant_undef_index() {
 ; CHECK-NEXT:    ret <2 x ptr> <ptr @g, ptr @g>
 ;
   %gep = getelementptr i8, ptr @g, <2 x i64> undef
@@ -58,7 +63,7 @@ define <2 x ptr> @constant_undef_index() {
 }
 
 define <2 x ptr> @constant_inbounds() {
-; CHECK-LABEL: @constant_inbounds(
+; CHECK-LABEL: define <2 x ptr> @constant_inbounds() {
 ; CHECK-NEXT:    ret <2 x ptr> getelementptr inbounds (i8, ptr @g, <2 x i64> <i64 1, i64 1>)
 ;
   %gep = getelementptr i8, ptr @g, <2 x i64> <i64 1, i64 1>
diff --git a/llvm/test/Transforms/InstSimplify/vector_gep.ll b/llvm/test/Transforms/InstSimplify/vector_gep.ll
index ba0d978ed5b3c..79aa9f13d1ea7 100644
--- a/llvm/test/Transforms/InstSimplify/vector_gep.ll
+++ b/llvm/test/Transforms/InstSimplify/vector_gep.ll
@@ -1,105 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instsimplify < %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 declare void @helper(<2 x ptr>)
 define void @test(<2 x ptr> %a) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: <2 x ptr> [[A:%.*]]) {
+; CHECK-NEXT:    call void @helper(<2 x ptr> [[A]])
+; CHECK-NEXT:    ret void
+;
   %A = getelementptr i8, <2 x ptr> %a, <2 x i32> <i32 0, i32 0>
   call void @helper(<2 x ptr> %A)
   ret void
 }
 
 define <4 x ptr> @test1(<4 x ptr> %a) {
+; CHECK-LABEL: define <4 x ptr> @test1(
+; CHECK-SAME: <4 x ptr> [[A:%.*]]) {
+; CHECK-NEXT:    ret <4 x ptr> [[A]]
+;
   %gep = getelementptr i8, <4 x ptr> %a, <4 x i32> zeroinitializer
   ret <4 x ptr> %gep
-
-; CHECK-LABEL: @test1
-; CHECK-NEXT: ret <4 x ptr> %a
 }
 
 define <4 x ptr> @test2(<4 x ptr> %a) {
+; CHECK-LABEL: define <4 x ptr> @test2(
+; CHECK-SAME: <4 x ptr> [[A:%.*]]) {
+; CHECK-NEXT:    ret <4 x ptr> [[A]]
+;
   %gep = getelementptr i8, <4 x ptr> %a
   ret <4 x ptr> %gep
-
-; CHECK-LABEL: @test2
-; CHECK-NEXT: ret <4 x ptr> %a
 }
 
 %struct = type { double, float }
 
 define <4 x ptr> @test3() {
+; CHECK-LABEL: define <4 x ptr> @test3() {
+; CHECK-NEXT:    ret <4 x ptr> undef
+;
   %gep = getelementptr %struct, <4 x ptr> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x ptr> %gep
-
-; CHECK-LABEL: @test3
-; CHECK-NEXT: ret <4 x ptr> undef
 }
 
 %struct.empty = type { }
 
 define <4 x ptr> @test4(<4 x ptr> %a) {
+; CHECK-LABEL: define <4 x ptr> @test4(
+; CHECK-SAME: <4 x ptr> [[A:%.*]]) {
+; CHECK-NEXT:    ret <4 x ptr> [[A]]
+;
   %gep = getelementptr %struct.empty, <4 x ptr> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x ptr> %gep
-
-; CHECK-LABEL: @test4
-; CHECK-NEXT: ret <4 x ptr> %a
 }
 
 define <4 x ptr> @test5() {
+; CHECK-LABEL: define <4 x ptr> @test5() {
+; CHECK-NEXT:    ret <4 x ptr> getelementptr (i8, <4 x ptr> <ptr inttoptr (i64 1 to ptr), ptr inttoptr (i64 2 to ptr), ptr inttoptr (i64 3 to ptr), ptr inttoptr (i64 4 to ptr)>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>)
+;
   %c = inttoptr <4 x i64> <i64 1, i64 2, i64 3, i64 4> to <4 x ptr>
   %gep = getelementptr i8, <4 x ptr> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   ret <4 x ptr> %gep
-
-; CHECK-LABEL: @test5
-; CHECK-NEXT: ret <4 x ptr> getelementptr (i8, <4 x ptr> <ptr inttoptr (i64 1 to ptr), ptr inttoptr (i64 2 to ptr), ptr inttoptr (i64 3 to ptr), ptr inttoptr (i64 4 to ptr)>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>)
 }
 
 @v = global [24 x [42 x [3 x i32]]] zeroinitializer, align 16
 
 define <16 x ptr> @test6() {
-; CHECK-LABEL: @test6
-; CHECK-NEXT: ret <16 x ptr> getelementptr inbounds ([24 x [42 x [3 x i32]]], ptr @v, <16 x i64> zeroinitializer, <16 x i64> zeroinitializer, <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, <16 x i64> zeroinitializer)
+; CHECK-LABEL: define <16 x ptr> @test6() {
+; CHECK-NEXT:    ret <16 x ptr> getelementptr inbounds ([24 x [42 x [3 x i32]]], ptr @v, <16 x i64> zeroinitializer, <16 x i64> zeroinitializer, <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, <16 x i64> zeroinitializer)
+;
   %VectorGep = getelementptr [24 x [42 x [3 x i32]]], ptr @v, i64 0, i64 0, <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, i64 0
   ret <16 x ptr> %VectorGep
 }
 
 ; PR32697
-; CHECK-LABEL: tinkywinky(
-; CHECK-NEXT: ret <4 x ptr> undef
 define <4 x ptr> @tinkywinky() {
+; CHECK-LABEL: define <4 x ptr> @tinkywinky() {
+; CHECK-NEXT:    ret <4 x ptr> undef
+;
   %patatino = getelementptr i8, ptr undef, <4 x i64> undef
   ret <4 x ptr> %patatino
 }
 
 ; PR32697
-; CHECK-LABEL: dipsy(
-; CHECK-NEXT: ret <4 x ptr> undef
 define <4 x ptr> @dipsy() {
+; CHECK-LABEL: define <4 x ptr> @dipsy() {
+; CHECK-NEXT:    ret <4 x ptr> undef
+;
   %patatino = getelementptr i8, <4 x ptr> undef, <4 x i64> undef
   ret <4 x ptr> %patatino
 }
 
 ; PR32697
-; CHECK-LABEL: laalaa(
-; CHECK-NEXT: ret <4 x ptr> undef
 define <4 x ptr> @laalaa() {
+; CHECK-LABEL: define <4 x ptr> @laalaa() {
+; CHECK-NEXT:    ret <4 x ptr> undef
+;
   %patatino = getelementptr i8, <4 x ptr> undef, i64 undef
   ret <4 x ptr> %patatino
 }
 
 define <2 x ptr> @zero_index(ptr %p) {
-; CHECK-LABEL: @zero_index(
-; CHECK-NEXT:    %gep = getelementptr i8, ptr %p, <2 x i64> zeroinitializer
-; CHECK-NEXT:    ret <2 x ptr> %gep
+; CHECK-LABEL: define <2 x ptr> @zero_index(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[P]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    ret <2 x ptr> [[GEP]]
 ;
   %gep = getelementptr i8, ptr %p, <2 x i64> zeroinitializer
   ret <2 x ptr> %gep
 }
 
 define <2 x ptr> @unsized(ptr %p) {
-; CHECK-LABEL: @unsized(
-; CHECK-NEXT:    %gep = getelementptr {}, ptr %p, <2 x i64> undef
-; CHECK-NEXT:    ret <2 x ptr> %gep
+; CHECK-LABEL: define <2 x ptr> @unsized(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr {}, ptr [[P]], <2 x i64> undef
+; CHECK-NEXT:    ret <2 x ptr> [[GEP]]
 ;
   %gep = getelementptr {}, ptr %p, <2 x i64> undef
   ret <2 x ptr> %gep
diff --git a/llvm/test/Transforms/LICM/update-scev-after-hoist.ll b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll
new file mode 100644
index 0000000000000..fc45b8fce1766
--- /dev/null
+++ b/llvm/test/Transforms/LICM/update-scev-after-hoist.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -passes='loop-unroll,loop-mssa(licm),print<scalar-evolution>' -unroll-count=4 -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCEV-EXPR
+
+define i16 @main() {
+; SCEV-EXPR:      Classifying expressions for: @main
+; SCEV-EXPR-NEXT:  %mul = phi i16 [ 1, %entry ], [ %mul.n.3, %loop ]
+; SCEV-EXPR-NEXT:  -->  %mul U: [0,-15) S: [-32768,32753)		Exits: 4096		LoopDispositions: { %loop: Variant }
+; SCEV-EXPR-NEXT:  %div = phi i16 [ 32767, %entry ], [ %div.n.3, %loop ]
+; SCEV-EXPR-NEXT:  -->  %div U: [-2048,-32768) S: [-2048,-32768)		Exits: 7		LoopDispositions: { %loop: Variant }
+; SCEV-EXPR-NEXT:  %mul.n.reass.reass = mul i16 %mul, 8
+; SCEV-EXPR-NEXT:  -->  (8 * %mul) U: [0,-7) S: [-32768,32761)		Exits: -32768		LoopDispositions: { %loop: Variant }
+entry:
+  br label %loop
+
+loop:
+  %mul = phi i16 [ 1, %entry ], [ %mul.n, %loop ]
+  %div = phi i16 [ 32767, %entry ], [ %div.n, %loop ]
+  %mul.n = mul i16 %mul, 2
+  %div.n = sdiv i16 %div, 2
+  %cmp = icmp sgt i16 %div, 0
+  br i1 %cmp, label %loop, label %end
+
+end:
+  ret i16 %mul
+}
diff --git a/llvm/test/Transforms/Reassociate/local-cse.ll b/llvm/test/Transforms/Reassociate/local-cse.ll
index 4d0467e263f55..d0d609f022b46 100644
--- a/llvm/test/Transforms/Reassociate/local-cse.ll
+++ b/llvm/test/Transforms/Reassociate/local-cse.ll
@@ -26,16 +26,16 @@ define void @chain_spanning_several_blocks(i64 %inv1, i64 %inv2, i64 %inv3, i64
 ; LOCAL_CSE-LABEL: define void @chain_spanning_several_blocks
 ; LOCAL_CSE-SAME: (i64 [[INV1:%.*]], i64 [[INV2:%.*]], i64 [[INV3:%.*]], i64 [[INV4:%.*]], i64 [[INV5:%.*]]) {
 ; LOCAL_CSE-NEXT:  bb1:
-; LOCAL_CSE-NEXT:    [[CHAIN_A0:%.*]] = add nuw i64 [[INV2]], [[INV1]]
+; LOCAL_CSE-NEXT:    [[CHAIN_A0:%.*]] = add nuw nsw i64 [[INV2]], [[INV1]]
 ; LOCAL_CSE-NEXT:    br label [[BB2:%.*]]
 ; LOCAL_CSE:       bb2:
 ; LOCAL_CSE-NEXT:    [[VAL_BB2:%.*]] = call i64 @get_val()
-; LOCAL_CSE-NEXT:    [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV4]]
-; LOCAL_CSE-NEXT:    [[CHAIN_A2:%.*]] = add nuw i64 [[CHAIN_A1]], [[VAL_BB2]]
-; LOCAL_CSE-NEXT:    [[CHAIN_B1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV5]]
-; LOCAL_CSE-NEXT:    [[CHAIN_B2:%.*]] = add nuw i64 [[CHAIN_B1]], [[VAL_BB2]]
-; LOCAL_CSE-NEXT:    [[CHAIN_C0:%.*]] = add nuw i64 [[INV3]], [[INV1]]
-; LOCAL_CSE-NEXT:    [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_C0]], [[VAL_BB2]]
+; LOCAL_CSE-NEXT:    [[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV4]]
+; LOCAL_CSE-NEXT:    [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[VAL_BB2]]
+; LOCAL_CSE-NEXT:    [[CHAIN_B1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV5]]
+; LOCAL_CSE-NEXT:    [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_B1]], [[VAL_BB2]]
+; LOCAL_CSE-NEXT:    [[CHAIN_C0:%.*]] = add nuw nsw i64 [[INV3]], [[INV1]]
+; LOCAL_CSE-NEXT:    [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_C0]], [[VAL_BB2]]
 ; LOCAL_CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_A2]])
 ; LOCAL_CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_B2]])
 ; LOCAL_CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_C1]])
@@ -47,11 +47,11 @@ define void @chain_spanning_several_blocks(i64 %inv1, i64 %inv2, i64 %inv3, i64
 ; CSE-NEXT:    br label [[BB2:%.*]]
 ; CSE:       bb2:
 ; CSE-NEXT:    [[VAL_BB2:%.*]] = call i64 @get_val()
-; CSE-NEXT:    [[CHAIN_A0:%.*]] = add nuw i64 [[VAL_BB2]], [[INV1]]
-; CSE-NEXT:    [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV2]]
+; CSE-NEXT:    [[CHAIN_A0:%.*]] = add nuw nsw i64 [[VAL_BB2]], [[INV1]]
+; CSE-NEXT:    [[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV2]]
 ; CSE-NEXT:    [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV4]]
 ; CSE-NEXT:    [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV5]]
-; CSE-NEXT:    [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV3]]
+; CSE-NEXT:    [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV3]]
 ; CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_A2]])
 ; CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_B2]])
 ; CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_C1]])
@@ -90,19 +90,19 @@ define void @chain_spanning_several_blocks_no_entry_anchor() {
 ; LOCAL_CSE-NEXT:    br label [[BB1:%.*]]
 ; LOCAL_CSE:       bb1:
 ; LOCAL_CSE-NEXT:    [[INV1_BB1:%.*]] = call i64 @get_val()
-; LOCAL_CSE-NEXT:    [[CHAIN_A0:%.*]] = add nuw i64 [[INV1_BB1]], [[INV2_BB0]]
+; LOCAL_CSE-NEXT:    [[CHAIN_A0:%.*]] = add nuw nsw i64 [[INV1_BB1]], [[INV2_BB0]]
 ; LOCAL_CSE-NEXT:    br label [[BB2:%.*]]
 ; LOCAL_CSE:       bb2:
 ; LOCAL_CSE-NEXT:    [[INV3_BB2:%.*]] = call i64 @get_val()
 ; LOCAL_CSE-NEXT:    [[INV4_BB2:%.*]] = call i64 @get_val()
 ; LOCAL_CSE-NEXT:    [[INV5_BB2:%.*]] = call i64 @get_val()
 ; LOCAL_CSE-NEXT:    [[VAL_BB2:%.*]] = call i64 @get_val()
-; LOCAL_CSE-NEXT:    [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV4_BB2]]
-; LOCAL_CSE-NEXT:    [[CHAIN_A2:%.*]] = add nuw i64 [[CHAIN_A1]], [[VAL_BB2]]
-; LOCAL_CSE-NEXT:    [[CHAIN_B1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV5_BB2]]
-; LOCAL_CSE-NEXT:    [[CHAIN_B2:%.*]] = add nuw i64 [[CHAIN_B1]], [[VAL_BB2]]
-; LOCAL_CSE-NEXT:    [[CHAIN_C0:%.*]] = add nuw i64 [[VAL_BB2]], [[INV1_BB1]]
-; LOCAL_CSE-NEXT:    [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_C0]], [[INV3_BB2]]
+; LOCAL_CSE-NEXT:    [[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV4_BB2]]
+; LOCAL_CSE-NEXT:    [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[VAL_BB2]]
+; LOCAL_CSE-NEXT:    [[CHAIN_B1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV5_BB2]]
+; LOCAL_CSE-NEXT:    [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_B1]], [[VAL_BB2]]
+; LOCAL_CSE-NEXT:    [[CHAIN_C0:%.*]] = add nuw nsw i64 [[VAL_BB2]], [[INV1_BB1]]
+; LOCAL_CSE-NEXT:    [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_C0]], [[INV3_BB2]]
 ; LOCAL_CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_A2]])
 ; LOCAL_CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_B2]])
 ; LOCAL_CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_C1]])
@@ -120,11 +120,11 @@ define void @chain_spanning_several_blocks_no_entry_anchor() {
 ; CSE-NEXT:    [[INV4_BB2:%.*]] = call i64 @get_val()
 ; CSE-NEXT:    [[INV5_BB2:%.*]] = call i64 @get_val()
 ; CSE-NEXT:    [[VAL_BB2:%.*]] = call i64 @get_val()
-; CSE-NEXT:    [[CHAIN_A0:%.*]] = add nuw i64 [[VAL_BB2]], [[INV1_BB1]]
-; CSE-NEXT:    [[CHAIN_A1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV2_BB0]]
+; CSE-NEXT:    [[CHAIN_A0:%.*]] = add nuw nsw i64 [[VAL_BB2]], [[INV1_BB1]]
+; CSE-NEXT:    [[CHAIN_A1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV2_BB0]]
 ; CSE-NEXT:    [[CHAIN_A2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV4_BB2]]
 ; CSE-NEXT:    [[CHAIN_B2:%.*]] = add nuw nsw i64 [[CHAIN_A1]], [[INV5_BB2]]
-; CSE-NEXT:    [[CHAIN_C1:%.*]] = add nuw i64 [[CHAIN_A0]], [[INV3_BB2]]
+; CSE-NEXT:    [[CHAIN_C1:%.*]] = add nuw nsw i64 [[CHAIN_A0]], [[INV3_BB2]]
 ; CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_A2]])
 ; CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_B2]])
 ; CSE-NEXT:    call void @keep_alive(i64 [[CHAIN_C1]])
diff --git a/llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll b/llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll
new file mode 100644
index 0000000000000..fcebc4980e6d7
--- /dev/null
+++ b/llvm/test/Transforms/Reassociate/reassoc-add-nsw.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=reassociate -S | FileCheck %s
+define i32 @nsw_preserve_nonnegative(ptr %ptr0, ptr %ptr1, ptr %ptr2) {
+; CHECK-LABEL: define i32 @nsw_preserve_nonnegative(
+; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) {
+; CHECK-NEXT:    [[V0:%.*]] = load i32, ptr [[PTR0]], align 4, !range [[RNG0:![0-9]+]]
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[PTR1]], align 4, !range [[RNG0]]
+; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[PTR2]], align 4, !range [[RNG0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = add nsw i32 [[V1]], [[V0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[ADD0]], [[V2]]
+; CHECK-NEXT:    ret i32 [[ADD1]]
+;
+  %v0 = load i32, ptr %ptr0, !range !1
+  %v1 = load i32, ptr %ptr1, !range !1
+  %v2 = load i32, ptr %ptr2, !range !1
+  %add0 = add nsw i32 %v1, %v2
+  %add1 = add nsw i32 %add0, %v0
+  ret i32 %add1
+}
+
+define i32 @nsw_preserve_nuw_nsw(ptr %ptr0, ptr %ptr1, ptr %ptr2) {
+; CHECK-LABEL: define i32 @nsw_preserve_nuw_nsw(
+; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) {
+; CHECK-NEXT:    [[V0:%.*]] = load i32, ptr [[PTR0]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[PTR1]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[PTR2]], align 4
+; CHECK-NEXT:    [[ADD0:%.*]] = add nuw nsw i32 [[V1]], [[V0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nuw nsw i32 [[ADD0]], [[V2]]
+; CHECK-NEXT:    ret i32 [[ADD1]]
+;
+  %v0 = load i32, ptr %ptr0
+  %v1 = load i32, ptr %ptr1
+  %v2 = load i32, ptr %ptr2
+  %add0 = add nuw nsw i32 %v1, %v2
+  %add1 = add nuw nsw i32 %add0, %v0
+  ret i32 %add1
+}
+
+define i32 @nsw_dont_preserve_negative(ptr %ptr0, ptr %ptr1, ptr %ptr2) {
+; CHECK-LABEL: define i32 @nsw_dont_preserve_negative(
+; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) {
+; CHECK-NEXT:    [[V0:%.*]] = load i32, ptr [[PTR0]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[PTR1]], align 4, !range [[RNG0]]
+; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[PTR2]], align 4, !range [[RNG0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = add i32 [[V1]], [[V0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[ADD0]], [[V2]]
+; CHECK-NEXT:    ret i32 [[ADD1]]
+;
+  %v0 = load i32, ptr %ptr0
+  %v1 = load i32, ptr %ptr1, !range !1
+  %v2 = load i32, ptr %ptr2, !range !1
+  %add0 = add nsw i32 %v1, %v2
+  %add1 = add nsw i32 %add0, %v0
+  ret i32 %add1
+}
+
+define i32 @nsw_nopreserve_notallnsw(ptr %ptr0, ptr %ptr1, ptr %ptr2) {
+; CHECK-LABEL: define i32 @nsw_nopreserve_notallnsw(
+; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], ptr [[PTR2:%.*]]) {
+; CHECK-NEXT:    [[V0:%.*]] = load i32, ptr [[PTR0]], align 4, !range [[RNG0:![0-9]+]]
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[PTR1]], align 4, !range [[RNG0]]
+; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[PTR2]], align 4, !range [[RNG0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = add i32 [[V1]], [[V0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add i32 [[ADD0]], [[V2]]
+; CHECK-NEXT:    ret i32 [[ADD1]]
+;
+  %v0 = load i32, ptr %ptr0, !range !1
+  %v1 = load i32, ptr %ptr1, !range !1
+  %v2 = load i32, ptr %ptr2, !range !1
+  %add0 = add nsw i32 %v1, %v2
+  %add1 = add i32 %add0, %v0
+  ret i32 %add1
+}
+
+; Non-negative 32-bit integers: the half-open range [0, 2^31)
+!1 = !{i32 0, i32 2147483648}
+;.
+; CHECK: [[RNG0]] = !{i32 0, i32 -2147483648}
+;.
diff --git a/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll b/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll
index fcedde23ecc7f..bd0060cc5abbd 100644
--- a/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll
+++ b/llvm/test/Transforms/Reassociate/reassoc_bool_vec.ll
@@ -57,13 +57,12 @@ define <8 x i1> @vector2(<8 x i1> %a, <8 x i1> %b0, <8 x i1> %b1, <8 x i1> %b2,
 ; CHECK-NEXT:    [[OR6:%.*]] = or <8 x i1> [[B6]], [[A]]
 ; CHECK-NEXT:    [[OR7:%.*]] = or <8 x i1> [[B7]], [[A]]
 ; CHECK-NEXT:    [[XOR0:%.*]] = xor <8 x i1> [[OR1]], [[OR0]]
-; CHECK-NEXT:    [[XOR1:%.*]] = xor <8 x i1> [[XOR0]], [[OR2]]
-; CHECK-NEXT:    [[XOR2:%.*]] = xor <8 x i1> [[XOR1]], [[OR3]]
-; CHECK-NEXT:    [[XOR3:%.*]] = xor <8 x i1> [[XOR2]], [[OR4]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor <8 x i1> [[XOR0]], [[OR2]]
+; CHECK-NEXT:    [[OR045:%.*]] = xor <8 x i1> [[XOR2]], [[OR3]]
+; CHECK-NEXT:    [[XOR3:%.*]] = xor <8 x i1> [[OR045]], [[OR4]]
 ; CHECK-NEXT:    [[XOR4:%.*]] = xor <8 x i1> [[XOR3]], [[OR5]]
 ; CHECK-NEXT:    [[XOR5:%.*]] = xor <8 x i1> [[XOR4]], [[OR6]]
 ; CHECK-NEXT:    [[XOR6:%.*]] = xor <8 x i1> [[XOR5]], [[OR7]]
-; CHECK-NEXT:    [[OR045:%.*]] = or <8 x i1> [[XOR1]], [[XOR0]]
 ; CHECK-NEXT:    [[OR4560:%.*]] = or <8 x i1> [[OR045]], [[XOR2]]
 ; CHECK-NEXT:    [[OR023:%.*]] = or <8 x i1> [[OR4560]], [[XOR3]]
 ; CHECK-NEXT:    [[OR001:%.*]] = or <8 x i1> [[OR023]], [[XOR4]]
diff --git a/llvm/test/Transforms/Reassociate/repeats.ll b/llvm/test/Transforms/Reassociate/repeats.ll
index c18db19fa73e3..28177f1c0ba5e 100644
--- a/llvm/test/Transforms/Reassociate/repeats.ll
+++ b/llvm/test/Transforms/Reassociate/repeats.ll
@@ -1,56 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=reassociate -S | FileCheck %s
 
 ; Tests involving repeated operations on the same value.
 
 define i8 @nilpotent(i8 %x) {
-; CHECK-LABEL: @nilpotent(
+; CHECK-LABEL: define i8 @nilpotent(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    ret i8 0
+;
   %tmp = xor i8 %x, %x
   ret i8 %tmp
-; CHECK: ret i8 0
 }
 
 define i2 @idempotent(i2 %x) {
-; CHECK-LABEL: @idempotent(
+; CHECK-LABEL: define i2 @idempotent(
+; CHECK-SAME: i2 [[X:%.*]]) {
+; CHECK-NEXT:    ret i2 [[X]]
+;
   %tmp1 = and i2 %x, %x
   %tmp2 = and i2 %tmp1, %x
   %tmp3 = and i2 %tmp2, %x
   ret i2 %tmp3
-; CHECK: ret i2 %x
 }
 
 define i2 @add(i2 %x) {
-; CHECK-LABEL: @add(
+; CHECK-LABEL: define i2 @add(
+; CHECK-SAME: i2 [[X:%.*]]) {
+; CHECK-NEXT:    ret i2 0
+;
   %tmp1 = add i2 %x, %x
   %tmp2 = add i2 %tmp1, %x
   %tmp3 = add i2 %tmp2, %x
   ret i2 %tmp3
-; CHECK: ret i2 0
 }
 
 define i2 @cst_add() {
-; CHECK-LABEL: @cst_add(
+; CHECK-LABEL: define i2 @cst_add() {
+; CHECK-NEXT:    ret i2 -1
+;
   %tmp1 = add i2 1, 1
   %tmp2 = add i2 %tmp1, 1
   ret i2 %tmp2
-; CHECK: ret i2 -1
 }
 
 define i8 @cst_mul() {
-; CHECK-LABEL: @cst_mul(
+; CHECK-LABEL: define i8 @cst_mul() {
+; CHECK-NEXT:    ret i8 -13
+;
   %tmp1 = mul i8 3, 3
   %tmp2 = mul i8 %tmp1, 3
   %tmp3 = mul i8 %tmp2, 3
   %tmp4 = mul i8 %tmp3, 3
   ret i8 %tmp4
-; CHECK: ret i8 -13
 }
 
 define i3 @foo3x5(i3 %x) {
 ; Can be done with two multiplies.
-; CHECK-LABEL: @foo3x5(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i3 @foo3x5(
+; CHECK-SAME: i3 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i3 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i3 [[TMP3]], [[X]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i3 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret i3 [[TMP5]]
+;
   %tmp1 = mul i3 %x, %x
   %tmp2 = mul i3 %tmp1, %x
   %tmp3 = mul i3 %tmp2, %x
@@ -58,12 +70,31 @@ define i3 @foo3x5(i3 %x) {
   ret i3 %tmp4
 }
 
+define i3 @foo3x5_nsw(i3 %x) {
+; Can be done with two multiplies.
+; CHECK-LABEL: define i3 @foo3x5_nsw(
+; CHECK-SAME: i3 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i3 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i3 [[TMP3]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i3 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i3 [[TMP4]]
+;
+  %tmp1 = mul i3 %x, %x
+  %tmp2 = mul i3 %tmp1, %x
+  %tmp3 = mul i3 %tmp2, %x
+  %tmp4 = mul nsw i3 %tmp3, %x
+  ret i3 %tmp4
+}
+
 define i3 @foo3x6(i3 %x) {
 ; Can be done with two multiplies.
-; CHECK-LABEL: @foo3x6(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i3 @foo3x6(
+; CHECK-SAME: i3 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i3 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i3 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i3 [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    ret i3 [[TMP2]]
+;
   %tmp1 = mul i3 %x, %x
   %tmp2 = mul i3 %tmp1, %x
   %tmp3 = mul i3 %tmp2, %x
@@ -74,10 +105,14 @@ define i3 @foo3x6(i3 %x) {
 
 define i3 @foo3x7(i3 %x) {
 ; Can be done with two multiplies.
-; CHECK-LABEL: @foo3x7(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i3 @foo3x7(
+; CHECK-SAME: i3 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i3 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i3 [[TMP5]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i3 [[TMP7]], [[X]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i3 [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    ret i3 [[TMP6]]
+;
   %tmp1 = mul i3 %x, %x
   %tmp2 = mul i3 %tmp1, %x
   %tmp3 = mul i3 %tmp2, %x
@@ -89,10 +124,13 @@ define i3 @foo3x7(i3 %x) {
 
 define i4 @foo4x8(i4 %x) {
 ; Can be done with two multiplies.
-; CHECK-LABEL: @foo4x8(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x8(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    ret i4 [[TMP4]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -105,11 +143,14 @@ define i4 @foo4x8(i4 %x) {
 
 define i4 @foo4x9(i4 %x) {
 ; Can be done with three multiplies.
-; CHECK-LABEL: @foo4x9(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x9(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP2]], [[X]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i4 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    ret i4 [[TMP8]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -123,11 +164,14 @@ define i4 @foo4x9(i4 %x) {
 
 define i4 @foo4x10(i4 %x) {
 ; Can be done with three multiplies.
-; CHECK-LABEL: @foo4x10(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x10(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i4 [[TMP4]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    ret i4 [[TMP3]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -142,12 +186,15 @@ define i4 @foo4x10(i4 %x) {
 
 define i4 @foo4x11(i4 %x) {
 ; Can be done with four multiplies.
-; CHECK-LABEL: @foo4x11(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x11(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i4 [[TMP4]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP2]], [[X]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i4 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    ret i4 [[TMP10]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -163,10 +210,14 @@ define i4 @foo4x11(i4 %x) {
 
 define i4 @foo4x12(i4 %x) {
 ; Can be done with two multiplies.
-; CHECK-LABEL: @foo4x12(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x12(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP4]], [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i4 [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    ret i4 [[TMP2]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -183,11 +234,15 @@ define i4 @foo4x12(i4 %x) {
 
 define i4 @foo4x13(i4 %x) {
 ; Can be done with three multiplies.
-; CHECK-LABEL: @foo4x13(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x13(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP3]], [[X]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i4 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret i4 [[TMP12]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -205,11 +260,15 @@ define i4 @foo4x13(i4 %x) {
 
 define i4 @foo4x14(i4 %x) {
 ; Can be done with three multiplies.
-; CHECK-LABEL: @foo4x14(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x14(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i4 [[TMP4]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i4 [[TMP5]], [[X]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i4 [[TMP6]], [[TMP6]]
+; CHECK-NEXT:    ret i4 [[TMP7]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
@@ -228,12 +287,16 @@ define i4 @foo4x14(i4 %x) {
 
 define i4 @foo4x15(i4 %x) {
 ; Can be done with four multiplies.
-; CHECK-LABEL: @foo4x15(
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: mul
-; CHECK-NEXT: ret
+; CHECK-LABEL: define i4 @foo4x15(
+; CHECK-SAME: i4 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i4 [[X]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i4 [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i4 [[TMP4]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i4 [[TMP3]], [[X]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i4 [[TMP6]], [[X]]
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i4 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i4 [[TMP14]]
+;
   %tmp1 = mul i4 %x, %x
   %tmp2 = mul i4 %tmp1, %x
   %tmp3 = mul i4 %tmp2, %x
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof
new file mode 100644
index 0000000000000..d1c0408210f49
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-hot-callsite-threshold.prof
@@ -0,0 +1,3 @@
+foo:100:100
+ 1: bar:100
+  1:100
diff --git a/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll b/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll
new file mode 100644
index 0000000000000..914ab4f1e3da5
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/inline-hot-callsite-threshold.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-hot-callsite-threshold.prof -S -pass-remarks=sample-profile -sample-profile-hot-inline-threshold=100 2>&1 | FileCheck %s
+
+; CHECK: remark: a.cc:6:12: 'bar' inlined into 'foo' to match profiling context with (cost={{.*}}, threshold=100)
+; CHECK:     define dso_local noundef i32 @foo(i32 noundef %0)
+; CHECK-NOT:   %2 = tail call noundef i32 @bar(i32 noundef %0)
+; CHECK-NEXT:  %2 = icmp sgt i32 %0, 1
+; CHECK-NEXT:  br i1 %2, label %3, label %bar.exit
+
+; Manually lower the cost threshold for hot function inlining, so that the
+; function is not inlined even though the profile indicates it is hot.
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-hot-callsite-threshold.prof -S -pass-remarks=sample-profile -sample-profile-hot-inline-threshold=1 2>&1 | FileCheck %s --check-prefix=COST
+
+; COST-NOT:  remark
+; COST: define dso_local noundef i32 @foo(i32 noundef %0)
+; COST-NEXT: %2 = tail call noundef i32 @bar(i32 noundef %0)
+
+define dso_local noundef i32 @bar(i32 noundef %0) #0 !dbg !10 {
+  %2 = icmp sgt i32 %0, 1
+  br i1 %2, label %3, label %15
+3:                                                ; preds = %1
+  %4 = add nsw i32 %0, -2
+  %5 = mul i32 %4, %4
+  %6 = add i32 %5, %0
+  %7 = zext nneg i32 %4 to i33
+  %8 = add nsw i32 %0, -3
+  %9 = zext i32 %8 to i33
+  %10 = mul i33 %7, %9
+  %11 = lshr i33 %10, 1
+  %12 = trunc nuw i33 %11 to i32
+  %13 = xor i32 %12, -1
+  %14 = add i32 %6, %13
+  br label %15
+15:                                               ; preds = %3, %1
+  %16 = phi i32 [ 0, %1 ], [ %14, %3 ]
+  ret i32 %16
+}
+
+define dso_local noundef i32 @foo(i32 noundef %0) #1 !dbg !20 {
+  %2 = tail call noundef i32 @bar(i32 noundef %0), !dbg !24
+  ret i32 %2
+}
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable  "use-sample-profile" }
+attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable  "use-sample-profile" }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "a.cc", directory: ".")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = distinct !DISubprogram(name: "bar", linkageName: "bar", scope: !1, file: !1, line: 1, type: !12, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!11 = !DIFile(filename: "a.cc", directory: ".")
+!12 = !DISubroutineType(types: !13)
+!13 = !{!14, !14}
+!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!20 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !11, file: !11, line: 5, type: !12, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!23 = !DILocation(line: 0, scope: !20)
+!24 = !DILocation(line: 6, column: 12, scope: !20)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
index 18cbd857d97bb..2cd9abf0e11e9 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll
@@ -98,7 +98,7 @@ if.end:
 ;YAML-NEXT:    - String:          '(cost='
 ;YAML-NEXT:    - Cost:            '15'
 ;YAML-NEXT:    - String:          ', threshold='
-;YAML-NEXT:    - Threshold:       '2147483647'
+;YAML-NEXT:    - Threshold:       '3000'
 ;YAML-NEXT:    - String:          ')'
 ;YAML-NEXT:    - String:          ' at callsite '
 ;YAML-NEXT:    - String:          foo
diff --git a/llvm/test/Transforms/SampleProfile/remarks.ll b/llvm/test/Transforms/SampleProfile/remarks.ll
index 997e02bb5b544..9c0143ae65ca7 100644
--- a/llvm/test/Transforms/SampleProfile/remarks.ll
+++ b/llvm/test/Transforms/SampleProfile/remarks.ll
@@ -22,7 +22,7 @@
 
 ; We are expecting foo() to be inlined in main() (almost all the cycles are
 ; spent inside foo).
-; CHECK: remark: remarks.cc:13:21: '_Z3foov' inlined into 'main' to match profiling context with (cost=130, threshold=2147483647) at callsite main:0:21;
+; CHECK: remark: remarks.cc:13:21: '_Z3foov' inlined into 'main' to match profiling context with (cost=130, threshold=3000) at callsite main:0:21;
 ; CHECK: remark: remarks.cc:9:19: 'rand' inlined into 'main' to match profiling context with (cost=always): always inline attribute at callsite _Z3foov:6:19 @ main:0:21;
 
 ; The back edge for the loop is the hottest edge in the loop subgraph.
@@ -51,7 +51,7 @@
 ;YAML-NEXT:    - String:          '(cost='
 ;YAML-NEXT:    - Cost:            '130'
 ;YAML-NEXT:    - String:          ', threshold='
-;YAML-NEXT:    - Threshold:       '2147483647'
+;YAML-NEXT:    - Threshold:       '3000'
 ;YAML-NEXT:    - String:          ')'
 ;YAML-NEXT:    - String:          ' at callsite '
 ;YAML-NEXT:    - String:          main
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index 5cbda8a1e112e..c2e9be5688967 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -15,9 +15,7 @@ define <8 x i8> @trivial(<8 x i8> %a) {
 
 define <4 x i32> @add_same_operands(<4 x i32> %x) {
 ; CHECK-LABEL: @add_same_operands(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[SHUF]], [[SHUF]]
-; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[ADD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
 ; CHECK-NEXT:    ret <4 x i32> [[REVSHUF]]
 ;
   %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -364,8 +362,7 @@ define <8 x i8> @inner_shuffle(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
 define <4 x i32> @extrause_add_same_operands(<4 x i32> %x) {
 ; CHECK-LABEL: @extrause_add_same_operands(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[SHUF]], [[SHUF]]
-; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[ADD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = add <4 x i32> [[X]], [[X]]
 ; CHECK-NEXT:    [[ADD2:%.*]] = add <4 x i32> [[SHUF]], [[REVSHUF]]
 ; CHECK-NEXT:    ret <4 x i32> [[ADD2]]
 ;
@@ -513,9 +510,7 @@ define <8 x half> @fma(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 
 define <4 x i64> @single_zext(<4 x i32> %x) {
 ; CHECK-LABEL: @single_zext(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext <4 x i32> [[SHUF]] to <4 x i64>
-; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i64> [[ZEXT]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = zext <4 x i32> [[X:%.*]] to <4 x i64>
 ; CHECK-NEXT:    ret <4 x i64> [[REVSHUF]]
 ;
   %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -570,19 +565,10 @@ define <8 x i16> @not_bitcast2(<4 x i32> %x, <8 x i16> %y) {
 
 define void @exttrunc(<8 x i32> %a, <8 x i32> %b, ptr %p) {
 ; CHECK-LABEL: @exttrunc(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[AB1:%.*]] = zext <4 x i32> [[AB]] to <4 x i64>
-; CHECK-NEXT:    [[AT1:%.*]] = zext <4 x i32> [[AT]] to <4 x i64>
-; CHECK-NEXT:    [[BB1:%.*]] = sext <4 x i32> [[BB]] to <4 x i64>
-; CHECK-NEXT:    [[BT1:%.*]] = sext <4 x i32> [[BT]] to <4 x i64>
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i64> [[AB1]], [[BB1]]
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i64> [[AT1]], [[BT1]]
-; CHECK-NEXT:    [[ABB1:%.*]] = trunc <4 x i64> [[ABB]] to <4 x i32>
-; CHECK-NEXT:    [[ABT1:%.*]] = trunc <4 x i64> [[ABT]] to <4 x i32>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB1]], <4 x i32> [[ABT1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[A:%.*]] to <8 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i32> [[B:%.*]] to <8 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i32>
 ; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
@@ -605,17 +591,9 @@ define void @exttrunc(<8 x i32> %a, <8 x i32> %b, ptr %p) {
 
 define void @zext(<8 x i16> %a, <8 x i16> %b, ptr %p) {
 ; CHECK-LABEL: @zext(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[AB1:%.*]] = zext <4 x i16> [[AB]] to <4 x i32>
-; CHECK-NEXT:    [[AT1:%.*]] = zext <4 x i16> [[AT]] to <4 x i32>
-; CHECK-NEXT:    [[BB1:%.*]] = zext <4 x i16> [[BB]] to <4 x i32>
-; CHECK-NEXT:    [[BT1:%.*]] = zext <4 x i16> [[BT]] to <4 x i32>
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i32> [[AB1]], [[BB1]]
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i32> [[AT1]], [[BT1]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB]], <4 x i32> [[ABT]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[B:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
@@ -636,17 +614,9 @@ define void @zext(<8 x i16> %a, <8 x i16> %b, ptr %p) {
 
 define void @sext(<8 x i16> %a, <8 x i16> %b, ptr %p) {
 ; CHECK-LABEL: @sext(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[AB1:%.*]] = sext <4 x i16> [[AB]] to <4 x i32>
-; CHECK-NEXT:    [[AT1:%.*]] = sext <4 x i16> [[AT]] to <4 x i32>
-; CHECK-NEXT:    [[BB1:%.*]] = sext <4 x i16> [[BB]] to <4 x i32>
-; CHECK-NEXT:    [[BT1:%.*]] = sext <4 x i16> [[BT]] to <4 x i32>
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i32> [[AB1]], [[BB1]]
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i32> [[AT1]], [[BT1]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB]], <4 x i32> [[ABT]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[B:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
@@ -705,11 +675,7 @@ define void @zext_types(<8 x i16> %a, <8 x i32> %b, ptr %p) {
 
 define void @trunc(<8 x i64> %a, <8 x i64> %b, ptr %p) {
 ; CHECK-LABEL: @trunc(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i64> [[A:%.*]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i64> [[A]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[ABB1:%.*]] = trunc <4 x i64> [[AB]] to <4 x i32>
-; CHECK-NEXT:    [[ABT1:%.*]] = trunc <4 x i64> [[AT]] to <4 x i32>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[ABB1]], <4 x i32> [[ABT1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = trunc <8 x i64> [[A:%.*]] to <8 x i32>
 ; CHECK-NEXT:    store <8 x i32> [[R]], ptr [[P:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
@@ -724,10 +690,8 @@ define void @trunc(<8 x i64> %a, <8 x i64> %b, ptr %p) {
 
 define <4 x i64> @zext_chain(<4 x i16> %x) {
 ; CHECK-LABEL: @zext_chain(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i16> [[X:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext <4 x i16> [[SHUF]] to <4 x i32>
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i32> [[ZEXT]] to <4 x i64>
-; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i64> [[SEXT]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i16> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64>
 ; CHECK-NEXT:    ret <4 x i64> [[REVSHUF]]
 ;
   %shuf = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -928,13 +892,11 @@ entry:
 
 define <4 x i8> @singleop(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-LABEL: @singleop(
-; CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x i8> [[A:%.*]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[B1:%.*]] = shufflevector <4 x i8> [[B:%.*]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[A2:%.*]] = zext <4 x i8> [[A1]] to <4 x i16>
-; CHECK-NEXT:    [[B2:%.*]] = zext <4 x i8> [[B1]] to <4 x i16>
-; CHECK-NEXT:    [[AB:%.*]] = add <4 x i16> [[A2]], [[B2]]
-; CHECK-NEXT:    [[T:%.*]] = trunc <4 x i16> [[AB]] to <4 x i8>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[T]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[B:%.*]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[A:%.*]] to <4 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = trunc <4 x i16> [[TMP4]] to <4 x i8>
 ; CHECK-NEXT:    ret <4 x i8> [[R]]
 ;
   %a1 = shufflevector <4 x i8> %a, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
index bb370a6d1dfeb..7f7790cecb0eb 100644
--- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
@@ -670,7 +670,7 @@ declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
 define amdgpu_cs void @test_buffer_atomic_fadd(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %offset, i1 %slc) {
   ; CHECK: immarg operand has non-immediate parameter
   ; CHECK-NEXT: i1 %slc
-  ; CHECK-ENXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc)
+  ; CHECK-NEXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc)
   call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc)
   ret void
 }
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index affd87b98c141..fe1262893212f 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -617,3 +617,6 @@ def have_ld64_plugin_support():
 # "OBJECT_MODE" to 'any' by default on AIX OS.
 if "system-aix" in config.available_features:
     config.environment["OBJECT_MODE"] = "any"
+
+if config.has_logf128:
+    config.available_features.add("has_logf128")
diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in
index 60a68b0edaf93..0968f6214772d 100644
--- a/llvm/test/lit.site.cfg.py.in
+++ b/llvm/test/lit.site.cfg.py.in
@@ -63,6 +63,7 @@ config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@
 config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@
 config.have_vc_rev = @LLVM_APPEND_VC_REV@
 config.force_vc_rev = "@LLVM_FORCE_VC_REVISION@"
+config.has_logf128 = @LLVM_HAS_LOGF128@
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
diff --git a/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s
index 653f544e36ce2..1db28a84e2ff6 100644
--- a/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s
+++ b/llvm/test/tools/llvm-exegesis/AArch64/latency-by-opcode-name.s
@@ -10,4 +10,4 @@ CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG2]]=0x0'
 # We don't check REG3 because in the case that REG2=REG3 the check would fail
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s b/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s
index f9b4860c3f4a0..cc2cf20ce05f4 100644
--- a/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s
+++ b/llvm/test/tools/llvm-exegesis/Mips/latency-GPR64.s
@@ -9,4 +9,4 @@ CHECK-NEXT:     AND64
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+_64]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s
index f3853eaa62ea7..dcbbd3cf7fc35 100644
--- a/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s
+++ b/llvm/test/tools/llvm-exegesis/Mips/latency-by-opcode-name.s
@@ -9,4 +9,4 @@ CHECK-NEXT:     ADD
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s
index 3d457aeb59276..c4d9fcf2e0613 100644
--- a/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s
+++ b/llvm/test/tools/llvm-exegesis/PowerPC/latency-by-opcode-name.s
@@ -8,4 +8,4 @@ CHECK-NEXT:     ADD8
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s
index 9cdd9bf029d02..384f9f1d8cf9e 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-CMOV32rr.s
@@ -8,4 +8,4 @@ CHECK-NEXT: key:
 CHECK-NEXT:   instructions:
 CHECK-NEXT:     'CMOV32rr {{.*}} i_0x{{[0-9a-f]}}'
 CHECK-NEXT: config: ''
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s
index 8b4f42dd32015..c82f5c884b992 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-IN16rr.s
@@ -12,4 +12,4 @@ CHECK-NEXT:     - {{.*}}
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s
index c20e687cf20d2..26c4391bc99d6 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SBB8rr.s
@@ -9,4 +9,4 @@ CHECK-NEXT:     SBB8rr
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s
index 7e67a4343f4e6..bf97a40c4bf0d 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-SQRTSSr.s
@@ -10,4 +10,4 @@ CHECK-NEXT:     SQRTSSr
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-NOT: crashed
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s b/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s
index 4fee6fe927097..08beccfe7704f 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/latency-by-opcode-name.s
@@ -9,4 +9,4 @@ CHECK-NEXT:     ADD32rr
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test b/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test
index 382e742144ac4..f27101d896608 100644
--- a/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/max-configs.test
@@ -9,7 +9,7 @@ CHECK-NEXT:     SBB8rr
 CHECK-NEXT: config: ''
 CHECK-NEXT: register_initial_values:
 CHECK-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK-LAST: ...
+CHECK-DAG: ...
 
 CHECK1-NOT: SBB8rr
 
@@ -21,4 +21,4 @@ CHECK2-NEXT:     SBB8rr
 CHECK2-NEXT: config: ''
 CHECK2-NEXT: register_initial_values:
 CHECK2-DAG: - '[[REG1:[A-Z0-9]+]]=0x0'
-CHECK2-LAST: ...
+CHECK2-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s
index af1662d93a744..2a8cc8e34450a 100644
--- a/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s
+++ b/llvm/test/tools/llvm-exegesis/X86/lbr/mov-add.s
@@ -16,4 +16,4 @@ CHECK-NEXT: {{.*}}
 CHECK-NEXT: num_repetitions: 10000
 CHECK-NEXT: measurements:
 CHECK-NEXT: {{.*}} value: 0.0001, per_snippet_value: 0.0002 {{.*}}
-CHECK-LAST: ...
+CHECK-DAG: ...
diff --git a/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s b/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s
index 302c2b0ee722b..1e673e806da21 100644
--- a/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s
+++ b/llvm/test/tools/llvm-exegesis/X86/uops/uops-CMOV16rm-noreg-serialization.s
@@ -8,4 +8,4 @@ CHECK-YAML-NEXT: mode:            uops
 CHECK-YAML-NEXT: key:
 CHECK-YAML-NEXT:   instructions:
 CHECK-YAML-NEXT:     - 'CMOV16rm {{[A-Z0-9]+}} {{[A-Z0-9]+}} {{[A-Z0-9]+}} i_0x1 %noreg i_0x0 %noreg i_0x{{[0-9a-f]}}'
-CHECK-YAML-LAST: ...
+CHECK-YAML-DAG: ...
diff --git a/llvm/test/tools/llvm-profdata/memprof-merge-v0.test b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test
similarity index 77%
rename from llvm/test/tools/llvm-profdata/memprof-merge-v0.test
rename to llvm/test/tools/llvm-profdata/memprof-merge-versions.test
index 28f65e0781bc6..aa7d0329425dc 100644
--- a/llvm/test/tools/llvm-profdata/memprof-merge-v0.test
+++ b/llvm/test/tools/llvm-profdata/memprof-merge-versions.test
@@ -19,6 +19,12 @@ RUN: llvm-profdata show %t.prof.v2 | FileCheck %s
 RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=2 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v2
 RUN: llvm-profdata show %t.prof.v2 | FileCheck %s
 
+RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=3 --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v3
+RUN: llvm-profdata show %t.prof.v3 | FileCheck %s
+
+RUN: llvm-profdata merge %t.proftext %p/Inputs/basic.memprofraw --memprof-version=3 --memprof-full-schema --profiled-binary %p/Inputs/basic.memprofexe -o %t.prof.v3
+RUN: llvm-profdata show %t.prof.v3 | FileCheck %s
+
 For now we only check the validity of the instrumented profile since we don't
 have a way to display the contents of the memprof indexed format yet.
 
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 28c3afa101647..fae6d1e989ab5 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -306,7 +306,8 @@ cl::opt<memprof::IndexedVersion> MemProfVersionRequested(
     cl::init(memprof::Version0),
     cl::values(clEnumValN(memprof::Version0, "0", "version 0"),
                clEnumValN(memprof::Version1, "1", "version 1"),
-               clEnumValN(memprof::Version2, "2", "version 2")));
+               clEnumValN(memprof::Version2, "2", "version 2"),
+               clEnumValN(memprof::Version3, "3", "version 3")));
 
 cl::opt<bool> MemProfFullSchema(
     "memprof-full-schema", cl::Hidden, cl::sub(MergeSubcommand),
diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
index b79458529623f..5efa7d2722d3f 100644
--- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
@@ -116,6 +116,24 @@ static void EmitARMTargetDef(RecordKeeper &RK, raw_ostream &OS) {
      << "#endif // EMIT_EXTENSIONS\n"
      << "\n";
 
+  // Emit extension dependencies
+  OS << "#ifdef EMIT_EXTENSION_DEPENDENCIES\n"
+     << "inline constexpr ExtensionDependency ExtensionDependencies[] = {\n";
+  for (const Record *Rec : SortedExtensions) {
+    auto LaterAEK = Rec->getValueAsString("ArchExtKindSpelling").upper();
+    for (const Record *I : Rec->getValueAsListOfDefs("Implies"))
+      if (auto EarlierAEK = I->getValueAsOptionalString("ArchExtKindSpelling"))
+        OS << "  {" << EarlierAEK->upper() << ", " << LaterAEK << "},\n";
+  }
+  // FIXME: Tablegen has the Subtarget Feature FeatureRCPC_IMMO which is implied
+  // by FeatureRCPC3 and in turn implies FeatureRCPC. The proper fix is to make
+  // FeatureRCPC_IMMO an Extension but that will expose it to the command line.
+  OS << "  {AEK_RCPC, AEK_RCPC3},\n";
+  OS << "};\n"
+     << "#undef EMIT_EXTENSION_DEPENDENCIES\n"
+     << "#endif // EMIT_EXTENSION_DEPENDENCIES\n"
+     << "\n";
+
   // Emit architecture information
   OS << "#ifdef EMIT_ARCHITECTURES\n";
 
diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp
index a8970d8bcbacd..d89a1f078328b 100644
--- a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp
+++ b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp
@@ -277,8 +277,22 @@ void X86InstrMappingEmitter::emitNFTransformTable(
     if (Pos == std::string::npos)
       continue;
 
-    if (auto *NewRec = Records.getDef(Name.erase(Pos, 3)))
+    if (auto *NewRec = Records.getDef(Name.erase(Pos, 3))) {
+#ifndef NDEBUG
+      auto ClobberEFLAGS = [](const Record *R) {
+        return llvm::any_of(
+            R->getValueAsListOfDefs("Defs"),
+            [](const Record *Def) { return Def->getName() == "EFLAGS"; });
+      };
+      if (ClobberEFLAGS(Rec))
+        report_fatal_error("EFLAGS should not be clobbered by " +
+                           Rec->getName());
+      if (!ClobberEFLAGS(NewRec))
+        report_fatal_error("EFLAGS should be clobbered by " +
+                           NewRec->getName());
+#endif
       Table.push_back(std::pair(&Target.getInstruction(NewRec), Inst));
+    }
   }
   printTable(Table, "X86NFTransformTable", "GET_X86_NF_TRANSFORM_TABLE", OS);
 }
diff --git a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
index 665a394f57a6a..cab601bf8131f 100644
--- a/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
+++ b/llvm/utils/TableGen/X86ManualCompressEVEXTables.def
@@ -48,6 +48,14 @@ NOCOMP(VPSRAQZ256ri)
 NOCOMP(VPSRAQZ256rm)
 NOCOMP(VPSRAQZ256rr)
 NOCOMP(VSCALEFPSZ256rm)
+// When the condition evaluates to false, the destination register is zeroed
+// for non-NDD CFCMOV but not for NDD CFCMOV.
+NOCOMP(CFCMOV16rm_ND)
+NOCOMP(CFCMOV16rr_ND)
+NOCOMP(CFCMOV32rm_ND)
+NOCOMP(CFCMOV32rr_ND)
+NOCOMP(CFCMOV64rm_ND)
+NOCOMP(CFCMOV64rr_ND)
 #undef NOCOMP
 
 #ifndef ENTRY
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index b642b2c82e6d8..8a5f6d1908784 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -289,7 +289,6 @@ if (current_toolchain == default_toolchain) {
       "__atomic/kill_dependency.h",
       "__atomic/memory_order.h",
       "__atomic/to_gcc_order.h",
-      "__availability",
       "__bit/bit_cast.h",
       "__bit/bit_ceil.h",
       "__bit/bit_floor.h",
@@ -384,6 +383,11 @@ if (current_toolchain == default_toolchain) {
       "__concepts/totally_ordered.h",
       "__condition_variable/condition_variable.h",
       "__config",
+      "__configuration/abi.h",
+      "__configuration/availability.h",
+      "__configuration/compiler.h",
+      "__configuration/language.h",
+      "__configuration/platform.h",
       "__coroutine/coroutine_handle.h",
       "__coroutine/coroutine_traits.h",
       "__coroutine/noop_coroutine_handle.h",
diff --git a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn
index c99c1b5483355..f0bf6a8f3dbaf 100644
--- a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn
@@ -40,6 +40,8 @@ target(liblldb_type, "liblldb") {
   include_dirs = [ ".." ]
   sources = [
     "SBAddress.cpp",
+    "SBAddressRange.cpp",
+    "SBAddressRangeList.cpp",
     "SBAttachInfo.cpp",
     "SBBlock.cpp",
     "SBBreakpoint.cpp",
diff --git a/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn
index 30a9fb3ecceaa..0c9632a0a1915 100644
--- a/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Core/BUILD.gn
@@ -45,6 +45,7 @@ static_library("Core") {
   sources = [
     "Address.cpp",
     "AddressRange.cpp",
+    "AddressRangeListImpl.cpp",
     "AddressResolver.cpp",
     "AddressResolverFileLine.cpp",
     "Communication.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index e93130eacdc74..d8266fee05014 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -318,6 +318,7 @@ write_cmake_config("llvm-config") {
     "LLVM_ENABLE_ZSTD=",
     "LLVM_FORCE_USE_OLD_TOOLCHAIN=",
     "LLVM_HAS_ATOMICS=1",
+    "LLVM_HAS_LOGF128=",
     "LLVM_HAVE_TFLITE=",
     "LLVM_HOST_TRIPLE=$llvm_current_triple",
     "LLVM_NATIVE_ARCH=$native_target",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
index 78a9d20812ef9..8264f6d73e791 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
@@ -12,9 +12,9 @@ tablegen("X86GenDAGISel") {
   td_file = "X86.td"
 }
 
-tablegen("X86GenCompressEVEXTables") {
+tablegen("X86GenInstrMapping") {
   visibility = [ ":LLVMX86CodeGen" ]
-  args = [ "-gen-x86-compress-evex-tables" ]
+  args = [ "-gen-x86-instr-mapping" ]
   td_file = "X86.td"
 }
 
@@ -48,11 +48,11 @@ tablegen("X86GenRegisterBank") {
 static_library("LLVMX86CodeGen") {
   deps = [
     ":X86GenCallingConv",
-    ":X86GenCompressEVEXTables",
     ":X86GenDAGISel",
     ":X86GenFastISel",
     ":X86GenFoldTables",
     ":X86GenGlobalISel",
+    ":X86GenInstrMapping",
     ":X86GenRegisterBank",
     "MCTargetDesc",
     "TargetInfo",
diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
index 826dcf4e6ee9b..60d6d7b8c3ce7 100644
--- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
@@ -64,6 +64,7 @@ write_lit_config("lit_site_cfg") {
     "LLVM_ENABLE_HTTPLIB=0",
     "LLVM_ENABLE_ZSTD=0",
     "LLVM_FORCE_VC_REVISION=",
+    "LLVM_HAS_LOGF128=0",
     "LLVM_HAVE_OPT_VIEWER_MODULES=0",
     "LLVM_HOST_TRIPLE=$llvm_current_triple",
     "LLVM_INCLUDE_DXIL_TESTS=0",
diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
index f3ae5b5899ac6..2e11d25767cd0 100644
--- a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
@@ -64,7 +64,7 @@ executable("llvm-tblgen") {
     "SearchableTableEmitter.cpp",
     "SubtargetEmitter.cpp",
     "WebAssemblyDisassemblerEmitter.cpp",
-    "X86CompressEVEXTablesEmitter.cpp",
+    "X86InstrMappingEmitter.cpp",
     "X86DisassemblerTables.cpp",
     "X86FoldTablesEmitter.cpp",
     "X86MnemonicTables.cpp",
diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim
index d86e3d1ddbc27..905d696400ca3 100644
--- a/llvm/utils/vim/syntax/llvm.vim
+++ b/llvm/utils/vim/syntax/llvm.vim
@@ -150,6 +150,7 @@ syn keyword llvmKeyword
       \ preallocated
       \ private
       \ protected
+      \ ptrauth
       \ ptx_device
       \ ptx_kernel
       \ readnone
diff --git a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h
index 8bd7cf880c6af..191c023fb642c 100644
--- a/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h
+++ b/mlir/include/mlir/Analysis/DataFlow/IntegerRangeAnalysis.h
@@ -24,51 +24,6 @@
 namespace mlir {
 namespace dataflow {
 
-/// This lattice value represents the integer range of an SSA value.
-class IntegerValueRange {
-public:
-  /// Create a maximal range ([0, uint_max(t)] / [int_min(t), int_max(t)])
-  /// range that is used to mark the value as unable to be analyzed further,
-  /// where `t` is the type of `value`.
-  static IntegerValueRange getMaxRange(Value value);
-
-  /// Create an integer value range lattice value.
-  IntegerValueRange(std::optional<ConstantIntRanges> value = std::nullopt)
-      : value(std::move(value)) {}
-
-  /// Whether the range is uninitialized. This happens when the state hasn't
-  /// been set during the analysis.
-  bool isUninitialized() const { return !value.has_value(); }
-
-  /// Get the known integer value range.
-  const ConstantIntRanges &getValue() const {
-    assert(!isUninitialized());
-    return *value;
-  }
-
-  /// Compare two ranges.
-  bool operator==(const IntegerValueRange &rhs) const {
-    return value == rhs.value;
-  }
-
-  /// Take the union of two ranges.
-  static IntegerValueRange join(const IntegerValueRange &lhs,
-                                const IntegerValueRange &rhs) {
-    if (lhs.isUninitialized())
-      return rhs;
-    if (rhs.isUninitialized())
-      return lhs;
-    return IntegerValueRange{lhs.getValue().rangeUnion(rhs.getValue())};
-  }
-
-  /// Print the integer value range.
-  void print(raw_ostream &os) const { os << value; }
-
-private:
-  /// The known integer value range.
-  std::optional<ConstantIntRanges> value;
-};
-
 /// This lattice element represents the integer value range of an SSA value.
 /// When this lattice is updated, it automatically updates the constant value
 /// of the SSA value (if the range can be narrowed to one).
diff --git a/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h b/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h
index 3953c83f3aa10..76a4b1b156336 100644
--- a/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h
+++ b/mlir/include/mlir/Conversion/TosaToTensor/TosaToTensor.h
@@ -16,6 +16,7 @@
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
+class TypeConverter;
 
 #define GEN_PASS_DECL_TOSATOTENSOR
 #include "mlir/Conversion/Passes.h.inc"
@@ -24,7 +25,8 @@ namespace tosa {
 
 std::unique_ptr<Pass> createTosaToTensor();
 
-void populateTosaToTensorConversionPatterns(RewritePatternSet *patterns);
+void populateTosaToTensorConversionPatterns(TypeConverter &converter,
+                                            RewritePatternSet *patterns);
 
 } // namespace tosa
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index ead52332e8eec..81ed0f924a2e2 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -49,7 +49,7 @@ class Arith_BinaryOp<string mnemonic, list<Trait> traits = []> :
 // Base class for integer binary operations.
 class Arith_IntBinaryOp<string mnemonic, list<Trait> traits = []> :
     Arith_BinaryOp<mnemonic, traits #
-      [DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
+      [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
     Arguments<(ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs)>,
     Results<(outs SignlessIntegerLike:$result)>;
 
@@ -83,12 +83,25 @@ class Arith_FloatBinaryOp<string mnemonic, list<Trait> traits = []> :
                           attr-dict `:` type($result) }];
 }
 
+// Checks that tensor input and outputs have identical shapes. This is stricter
+// than the verification done in `SameOperandsAndResultShape` that allows for
+// tensor dimensions to be 'compatible' (e.g., dynamic dimensions being
+// compatible with static ones).
+def SameInputOutputTensorDims : PredOpTrait<
+    "input and output have the same tensor dimensions",
+    AllMatchSameOperatorPred<["in", "out"],
+      "(::llvm::isa<::mlir::TensorType>($_self.getType()) ?"
+      " ::llvm::cast<::mlir::TensorType>($_self.getType()).getShape() :"
+      " ::llvm::ArrayRef<int64_t>{})">>;
+
 // Base class for arithmetic cast operations. Requires a single operand and
-// result. If either is a shaped type, then the other must be of the same shape.
+// result. If either is a shaped type, then the other must be of the same
+// shape.  In the case of tensor types, this also includes the corresponding
+// operand/result dimensions being equal.
 class Arith_CastOp<string mnemonic, TypeConstraint From, TypeConstraint To,
                    list<Trait> traits = []> :
     Arith_Op<mnemonic, traits # [Pure, SameOperandsAndResultShape,
-      DeclareOpInterfaceMethods<CastOpInterface>]>,
+      SameInputOutputTensorDims, DeclareOpInterfaceMethods<CastOpInterface>]>,
     Arguments<(ins From:$in)>,
     Results<(outs To:$out)> {
   let assemblyFormat = "$in attr-dict `:` type($in) `to` type($out)";
@@ -107,7 +120,7 @@ class Arith_IToICastOp<string mnemonic, list<Trait> traits = []> :
     Arith_CastOp<mnemonic, SignlessFixedWidthIntegerLike,
                            SignlessFixedWidthIntegerLike,
                            traits #
-                           [DeclareOpInterfaceMethods<InferIntRangeInterface>]>;
+                           [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>;
 // Cast from an integer type to a floating point type.
 class Arith_IToFCastOp<string mnemonic, list<Trait> traits = []> :
     Arith_CastOp<mnemonic, SignlessFixedWidthIntegerLike, FloatLike, traits>;
@@ -139,7 +152,7 @@ class Arith_CompareOpOfAnyRank<string mnemonic, list<Trait> traits = []> :
 
 class Arith_IntBinaryOpWithOverflowFlags<string mnemonic, list<Trait> traits = []> :
     Arith_BinaryOp<mnemonic, traits #
-      [Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>,
+      [Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
        DeclareOpInterfaceMethods<ArithIntegerOverflowFlagsInterface>]>,
     Arguments<(ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs,
       DefaultValuedAttr<
@@ -159,7 +172,7 @@ def Arith_ConstantOp : Op<Arith_Dialect, "constant",
     [ConstantLike, Pure,
      DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>,
      AllTypesMatch<["value", "result"]>,
-     DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+     DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
   let summary = "integer or floating point constant";
   let description = [{
     The `constant` operation produces an SSA value equal to some integer or
@@ -1231,7 +1244,7 @@ def Arith_TruncIOp : Arith_IToICastOp<"trunci"> {
 
 def Arith_TruncFOp :
     Arith_Op<"truncf",
-      [Pure, SameOperandsAndResultShape,
+      [Pure, SameOperandsAndResultShape, SameInputOutputTensorDims,
        DeclareOpInterfaceMethods<ArithRoundingModeInterface>,
        DeclareOpInterfaceMethods<CastOpInterface>]>,
     Arguments<(ins FloatLike:$in,
@@ -1327,7 +1340,7 @@ def IndexCastTypeConstraint : TypeConstraint<Or<[
 
 def Arith_IndexCastOp
   : Arith_CastOp<"index_cast", IndexCastTypeConstraint, IndexCastTypeConstraint,
-                 [DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+                 [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
   let summary = "cast between index and integer types";
   let description = [{
     Casts between scalar or vector integers and corresponding 'index' scalar or
@@ -1346,7 +1359,7 @@ def Arith_IndexCastOp
 
 def Arith_IndexCastUIOp
   : Arith_CastOp<"index_castui", IndexCastTypeConstraint, IndexCastTypeConstraint,
-                 [DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+                 [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
   let summary = "unsigned cast between index and integer types";
   let description = [{
     Casts between scalar or vector integers and corresponding 'index' scalar or
@@ -1400,7 +1413,7 @@ def Arith_BitcastOp : Arith_CastOp<"bitcast", BitcastTypeConstraint,
 
 def Arith_CmpIOp
   : Arith_CompareOpOfAnyRank<"cmpi",
-                             [DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+                             [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
   let summary = "integer comparison operation";
   let description = [{
     The `cmpi` operation is a generic comparison for integer-like types. Its two
@@ -1555,7 +1568,7 @@ class ScalarConditionOrMatchingShape<list<string> names> :
 def SelectOp : Arith_Op<"select", [Pure,
     AllTypesMatch<["true_value", "false_value", "result"]>,
     ScalarConditionOrMatchingShape<["condition", "result"]>,
-    DeclareOpInterfaceMethods<InferIntRangeInterface>,
+    DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRangesFromOptional"]>,
   ] # ElementwiseMappable.traits> {
   let summary = "select operation";
   let description = [{
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
index cbc6147cb81e2..9dc262cc72ed0 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
@@ -24,9 +24,6 @@ namespace arith {
 class WideIntEmulationConverter;
 class NarrowTypeEmulationConverter;
 
-/// Create a pass to bufferize arith.constant ops.
-std::unique_ptr<Pass> createConstantBufferizePass(uint64_t alignment = 0);
-
 /// Adds patterns to emulate wide Arith and Function ops over integer
 /// types into supported ones. This is done by splitting original power-of-two
 /// i2N integer types into two iN halves.
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
index 4096e309199e9..550c5c0cf4f60 100644
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
@@ -11,22 +11,6 @@
 
 include "mlir/Pass/PassBase.td"
 
-def ArithBufferizePass : Pass<"arith-bufferize", "ModuleOp"> {
-  let summary = "Bufferize Arith dialect ops.";
-  let description = [{
-    This pass bufferizes arith dialect ops.
-
-    This pass needs to be a module pass because it inserts memref.global
-    ops into the module, which cannot be done safely from a function pass due to
-    multi-threading. Most other bufferization passes can run in parallel at
-    function granularity.
-  }];
-  let options = [
-    Option<"alignment", "alignment", "unsigned", /*default=*/"0",
-           "Create global memrefs with a specified alignment">,
-  ];
-}
-
 def ArithExpandOpsPass : Pass<"arith-expand"> {
   let summary = "Legalize Arith ops to be convertible to LLVM.";
   let dependentDialects = ["vector::VectorDialect"];
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
index 459c252b70712..e053e6c97e143 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h
@@ -221,9 +221,6 @@ createPromoteBuffersToStackPass(std::function<bool(Value)> isSmallAlloc);
 /// insert_slice ops.
 std::unique_ptr<Pass> createEmptyTensorEliminationPass();
 
-/// Create a pass that bufferizes ops from the bufferization dialect.
-std::unique_ptr<Pass> createBufferizationBufferizePass();
-
 //===----------------------------------------------------------------------===//
 // Registration
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index 75ce85c9128c9..8f8826b9ad56b 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -350,11 +350,6 @@ def FinalizingBufferize : Pass<"finalizing-bufferize", "func::FuncOp"> {
   let constructor = "mlir::bufferization::createFinalizingBufferizePass()";
 }
 
-def BufferizationBufferize : Pass<"bufferization-bufferize", "func::FuncOp"> {
-  let summary = "Bufferize the `bufferization` dialect";
-  let constructor = "mlir::bufferization::createBufferizationBufferizePass()";
-}
-
 def DropEquivalentBufferResults : Pass<"drop-equivalent-buffer-results", "ModuleOp">  {
   let summary = "Remove MemRef return values that are equivalent to a bbArg";
   let description = [{
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 1da68ed2176d8..10719aae5c8b4 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -52,7 +52,7 @@ def GPU_DimensionAttr : EnumAttr<GPU_Dialect, GPU_Dimension, "dim">;
 class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
     GPU_Op<mnemonic, !listconcat(traits, [
         Pure,
-        DeclareOpInterfaceMethods<InferIntRangeInterface>,
+        DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
         DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>])>,
     Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> {
   let assemblyFormat = "$dimension attr-dict";
@@ -144,7 +144,7 @@ def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
 }
 
 def GPU_LaneIdOp : GPU_Op<"lane_id", [
-      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]> {
+      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]> {
   let description = [{
     Returns the lane id within the subgroup (warp/wave).
 
@@ -158,7 +158,7 @@ def GPU_LaneIdOp : GPU_Op<"lane_id", [
 }
 
 def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
-      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
+      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
     Arguments<(ins)>, Results<(outs Index:$result)> {
   let description = [{
     Returns the subgroup id, i.e., the index of the current subgroup within the
@@ -190,7 +190,7 @@ def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {
 
 
 def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
-      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
+      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
     Arguments<(ins)>, Results<(outs Index:$result)> {
   let description = [{
     Returns the number of subgroups within a workgroup.
@@ -206,7 +206,7 @@ def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
 }
 
 def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
-      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface>]>,
+      Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
     Arguments<(ins)>, Results<(outs Index:$result)> {
   let description = [{
     Returns the number of threads within a subgroup.
@@ -687,7 +687,7 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
 
 def GPU_LaunchOp : GPU_Op<"launch", [
       AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface,
-      DeclareOpInterfaceMethods<InferIntRangeInterface>,
+      DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
       RecursiveMemoryEffects]>,
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
diff --git a/mlir/include/mlir/Dialect/Index/IR/IndexOps.td b/mlir/include/mlir/Dialect/Index/IR/IndexOps.td
index c6079cb8a98c8..a30ae9f739cbc 100644
--- a/mlir/include/mlir/Dialect/Index/IR/IndexOps.td
+++ b/mlir/include/mlir/Dialect/Index/IR/IndexOps.td
@@ -25,7 +25,7 @@ include "mlir/IR/OpBase.td"
 /// Base class for Index dialect operations.
 class IndexOp<string mnemonic, list<Trait> traits = []>
     : Op<IndexDialect, mnemonic,
-      [DeclareOpInterfaceMethods<InferIntRangeInterface>] # traits>;
+      [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>] # traits>;
 
 //===----------------------------------------------------------------------===//
 // IndexBinaryOp
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index 535cf8dfd2ced..bfcfbd64ae021 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -1037,4 +1037,25 @@ def LLVM_TargetFeaturesAttr : LLVM_Attr<"TargetFeatures", "target_features">
   let genVerifyDecl = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// UndefAttr
+//===----------------------------------------------------------------------===//
+
+/// Attribute that LLVM::UndefOp folds to.
+def LLVM_UndefAttr : LLVM_Attr<"Undef", "undef">;
+
+//===----------------------------------------------------------------------===//
+// PoisonAttr
+//===----------------------------------------------------------------------===//
+
+/// Attribute that LLVM::PoisonOp folds to.
+def LLVM_PoisonAttr : LLVM_Attr<"Poison", "poison">;
+
+//===----------------------------------------------------------------------===//
+// ZeroAttr
+//===----------------------------------------------------------------------===//
+
+/// Attribute that LLVM::ZeroOp folds to.
+def LLVM_ZeroAttr : LLVM_Attr<"Zero", "zero">;
+
 #endif // LLVMIR_ATTRDEFS
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 84e67d2c11dbd..f6f907f39a4b4 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -1522,7 +1522,7 @@ def LLVM_NoneTokenOp
   let assemblyFormat = "attr-dict `:` type($res)";
 }
 
-def LLVM_UndefOp : LLVM_Op<"mlir.undef", [Pure]>,
+def LLVM_UndefOp : LLVM_Op<"mlir.undef", [Pure, ConstantLike]>,
                    LLVM_Builder<"$res = llvm::UndefValue::get($_resultType);"> {
   let summary = "Creates an undefined value of LLVM dialect type.";
   let description = [{
@@ -1541,9 +1541,10 @@ def LLVM_UndefOp : LLVM_Op<"mlir.undef", [Pure]>,
   let results = (outs LLVM_Type:$res);
   let builders = [LLVM_OneResultOpBuilder];
   let assemblyFormat = "attr-dict `:` type($res)";
+  let hasFolder = 1;
 }
 
-def LLVM_PoisonOp : LLVM_Op<"mlir.poison", [Pure]>,
+def LLVM_PoisonOp : LLVM_Op<"mlir.poison", [Pure, ConstantLike]>,
                     LLVM_Builder<"$res = llvm::PoisonValue::get($_resultType);"> {
   let summary = "Creates a poison value of LLVM dialect type.";
   let description = [{
@@ -1563,10 +1564,11 @@ def LLVM_PoisonOp : LLVM_Op<"mlir.poison", [Pure]>,
   let results = (outs LLVM_Type:$res);
   let builders = [LLVM_OneResultOpBuilder];
   let assemblyFormat = "attr-dict `:` type($res)";
+  let hasFolder = 1;
 }
 
 def LLVM_ZeroOp
-    : LLVM_Op<"mlir.zero", [Pure]>,
+    : LLVM_Op<"mlir.zero", [Pure, ConstantLike]>,
       LLVM_Builder<"$res = llvm::Constant::getNullValue($_resultType);">
 {
   let summary = "Creates a zero-initialized value of LLVM dialect type.";
@@ -1588,6 +1590,7 @@ def LLVM_ZeroOp
   let builders = [LLVM_OneResultOpBuilder];
   let assemblyFormat = "attr-dict `:` type($res)";
   let hasVerifier = 1;
+  let hasFolder = 1;
 }
 
 def LLVM_ConstantOp
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index eb7dd37010a67..fad234a9dcae9 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -3478,6 +3478,144 @@ structured_op: !LinalgStructuredOpConfig
                 - !ScalarExpression
                   scalar_arg: K
 --- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: conv_2d_ngchw_gfchw_q
+  cpp_class_name: Conv2DNgchwGfchwQOp
+  doc: |-
+    Performs 2-D grouped convolution with zero-point offsets.
+
+    Layout:
+      * Input: NGCHW.
+      * Kernel: GFCHW.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output. This includes the zero
+    point offsets common to quantized operations.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1, s2, s3 * s4 + s5 * s6, s7 * s8 + s9 * s10)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s1, s11, s2, s5, s9)>
+  - !LinalgOperandDefConfig
+    name: IZp
+    kind: scalar
+    type_var: I32
+  - !LinalgOperandDefConfig
+    name: KZp
+    kind: scalar
+    type_var: I32
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1, s11, s3, s7)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s4, s8)>
+    default_indices:
+    - 1
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s6, s10)>
+    default_indices:
+    - 1
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1, d5, d3 * s4 + d6 * s6, d4 * s8 + d7 * s10)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d1, d2, d5, d6, d7)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> ()>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> ()>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1, d2, d3, d4)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - reduction
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: binary
+                fn_name: sub
+                operands:
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: I
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: IZp
+            - !ScalarExpression
+              scalar_fn:
+                kind: binary
+                fn_name: sub
+                operands:
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: K
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: KZp
+--- !LinalgOpConfig
 metadata: !LinalgOpMetadata
   name: conv_3d_ndhwc_dhwcf
   cpp_class_name: Conv3DNdhwcDhwcfOp
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h
index d36d1e70f0b14..f2955d55e59ec 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.h
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.h
@@ -22,10 +22,6 @@ namespace func {
 class FuncOp;
 } // namespace func
 
-namespace bufferization {
-struct OneShotBufferizationOptions;
-} // namespace bufferization
-
 #define GEN_PASS_DECL
 #include "mlir/Dialect/Linalg/Passes.h.inc" // IWYU pragma: keep
 
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index 0a4ce8953136d..0621a9f33ba1e 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -89,16 +89,6 @@ def LinalgInlineScalarOperandsPass : Pass<"linalg-inline-scalar-operands"> {
   ];
 }
 
-def LinalgBufferizePass : Pass<"linalg-bufferize"> {
-  let summary = "Bufferize the linalg dialect";
-  let dependentDialects = [
-    "affine::AffineDialect",
-    "bufferization::BufferizationDialect",
-    "linalg::LinalgDialect",
-    "memref::MemRefDialect",
-  ];
-}
-
 def LinalgGeneralizeNamedOpsPass : Pass<"linalg-generalize-named-ops"> {
   let summary = "Convert named ops into generic ops";
   let dependentDialects = ["linalg::LinalgDialect"];
diff --git a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h
index cfb637f133f54..28e17459ff962 100644
--- a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.h
@@ -47,13 +47,6 @@ void populateShapeRewritePatterns(RewritePatternSet &patterns);
 void populateRemoveShapeConstraintsPatterns(RewritePatternSet &patterns);
 std::unique_ptr<OperationPass<func::FuncOp>> createRemoveShapeConstraintsPass();
 
-// Bufferizes shape dialect ops.
-//
-// Note that most shape dialect ops must be converted to std before
-// bufferization happens, as they are intended to be bufferized at the std
-// level.
-std::unique_ptr<OperationPass<func::FuncOp>> createShapeBufferizePass();
-
 /// Outline the shape computation part by adding shape.func and populate
 /// conrresponding mapping infomation into ShapeMappingAnalysis.
 std::unique_ptr<OperationPass<ModuleOp>> createOutlineShapeComputationPass();
diff --git a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td
index 9dfda9ea33615..83834509b4a35 100644
--- a/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Shape/Transforms/Passes.td
@@ -103,11 +103,4 @@ def ShapeToShapeLowering : Pass<"shape-to-shape-lowering", "func::FuncOp"> {
   let constructor = "mlir::createShapeToShapeLowering()";
 }
 
-// TODO: Generalize this to allow any type conversions desired.
-def ShapeBufferize : Pass<"shape-bufferize", "func::FuncOp"> {
-  let summary = "Bufferize the shape dialect.";
-  let constructor = "mlir::createShapeBufferizePass()";
-  let dependentDialects = ["bufferization::BufferizationDialect",
-                           "memref::MemRefDialect"];
-}
 #endif // MLIR_DIALECT_SHAPE_TRANSFORMS_PASSES
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
index bb49d6c256f21..d6d038ef65bdf 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
@@ -65,12 +65,6 @@ void populateSparseAssembler(RewritePatternSet &patterns, bool directOut);
 std::unique_ptr<Pass> createSparseAssembler();
 std::unique_ptr<Pass> createSparseAssembler(bool directOut);
 
-//===----------------------------------------------------------------------===//
-// The SparseEncodingPropagation pass.
-//===----------------------------------------------------------------------===//
-
-std::unique_ptr<Pass> createSparseEncodingPropagationPass();
-
 //===----------------------------------------------------------------------===//
 // The SparseReinterpretMap pass.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
index 94c3ca60030ee..2f844cee5ff52 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
@@ -40,42 +40,6 @@ def SparseAssembler : Pass<"sparse-assembler", "ModuleOp"> {
   ];
 }
 
-def SparseEncodingPropagation : Pass<"sparse-encoding-propagation", "func::FuncOp"> {
-  let summary = "Propagate sparse tensor encodings";
-  let description = [{
-    A pass that propagates sparse tensor encodings.
-
-    Background: To avoid introducing repetitive operations, sparse tensors
-    in MLIR try to reuse tensor operations whenever available. However, most
-    tensor operations are canonicalized/transformed without the knowledge
-    of sparsity. The pass tries to propagate missing sparse encodings.
-
-    For example:
-    ```mlir
-    %s = tensor.extract_slice %input[0, 0,] [2, 1] [1, 1]
-       : tensor<2x3xf32, #sparse> to tensor<2x1xf32, #sparse>
-
-    // After rank reducing (by tensor dialect transformation)
-    %t = tensor.extract_slice %input[0, 0,] [2, 1] [1, 1]
-       : tensor<2x3xf32, #sparse> to tensor<2xf32>
-    %s = tensor.expand_shape [[0, 1]] %t
-       : tensor<2xf32> to tensor<2x1xf32, #sparse>
-
-    // After sparsity propagation
-    %t = tensor.extract_slice %input[0, 0,] [2, 1] [1, 1]
-       : tensor<2x3xf32, #sparse> to tensor<2xf32, #sparse1>
-    %s = tensor.expand_shape [[0, 1]] %t
-       : tensor<2xf32, #sparse1> to tensor<2x1xf32, #sparse>
-    ```
-  }];
-
-  let constructor = "mlir::createSparseEncodingPropagationPass()";
-  let dependentDialects = [
-    "sparse_tensor::SparseTensorDialect",
-    "tensor::TensorDialect",
-  ];
-}
-
 def SparseReinterpretMap : Pass<"sparse-reinterpret-map", "ModuleOp"> {
   let summary = "Reinterprets sparse tensor type mappings";
   let description = [{
diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h
index 48f9066934a25..964c35b3f15b8 100644
--- a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.h
@@ -21,9 +21,6 @@ namespace tensor {
 /// Creates an instance of the `tensor` subset folding pass.
 std::unique_ptr<Pass> createFoldTensorSubsetOpsPass();
 
-/// Creates an instance of the `tensor` dialect bufferization pass.
-std::unique_ptr<Pass> createTensorBufferizePass();
-
 //===----------------------------------------------------------------------===//
 // Registration
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td
index 4cc3844f29120..be4c333836ec0 100644
--- a/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Passes.td
@@ -27,9 +27,4 @@ def FoldTensorSubsetOps : Pass<"fold-tensor-subset-ops"> {
   ];
 }
 
-def TensorBufferize : Pass<"tensor-bufferize", "func::FuncOp"> {
-  let summary = "Bufferize the `tensor` dialect";
-  let constructor = "mlir::tensor::createTensorBufferizePass()";
-}
-
 #endif // MLIR_DIALECT_TENSOR_TRANSFORMS_PASSES
diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h
index fbfc56dfe2cf4..1f9522b51a4cf 100644
--- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h
@@ -18,6 +18,7 @@
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
+class TypeConverter;
 namespace tosa {
 
 #define GEN_PASS_DECL
@@ -38,6 +39,8 @@ void populateTosaConstantReduction(MLIRContext *ctx,
                                    RewritePatternSet &patterns,
                                    bool aggressiveReduceConstant);
 
+void populateTosaTypeConversion(TypeConverter &converter);
+
 std::unique_ptr<Pass> createTosaLayerwiseConstantFoldPass();
 std::unique_ptr<Pass> createTosaLayerwiseConstantFoldPass(
     const TosaLayerwiseConstantFoldPassOptions &options);
diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h
index 911402551e14d..5667f4fa95ace 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.h
@@ -17,9 +17,6 @@ namespace vector {
 #define GEN_PASS_DECL
 #include "mlir/Dialect/Vector/Transforms/Passes.h.inc"
 
-/// Creates an instance of the `vector` dialect bufferization pass.
-std::unique_ptr<Pass> createVectorBufferizePass();
-
 /// Creates an instance of the `vector.mask` lowering pass.
 std::unique_ptr<Pass> createLowerVectorMaskPass();
 
diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td
index 31a0b3b2f0c53..7436998749791 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/Passes.td
@@ -11,11 +11,6 @@
 
 include "mlir/Pass/PassBase.td"
 
-def VectorBufferize : Pass<"vector-bufferize", "func::FuncOp"> {
-  let summary = "Bufferize Vector dialect ops";
-  let constructor = "mlir::vector::createVectorBufferizePass()";
-}
-
 def LowerVectorMaskPass : Pass<"lower-vector-mask", "func::FuncOp"> {
   let summary = "Lower 'vector.mask' operations";
   let constructor = "mlir::vector::createLowerVectorMaskPass()";
diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h
index 05064a72ef02e..0e107e88f5232 100644
--- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h
+++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h
@@ -105,10 +105,83 @@ class ConstantIntRanges {
 
 raw_ostream &operator<<(raw_ostream &, const ConstantIntRanges &);
 
+/// This lattice value represents the integer range of an SSA value.
+class IntegerValueRange {
+public:
+  /// Create a maximal range ([0, uint_max(t)] / [int_min(t), int_max(t)])
+  /// that is used to mark the value as unable to be analyzed further,
+  /// where `t` is the type of `value`.
+  static IntegerValueRange getMaxRange(Value value);
+
+  /// Create an integer value range lattice value.
+  IntegerValueRange(ConstantIntRanges value) : value(std::move(value)) {}
+
+  /// Create an integer value range lattice value.
+  IntegerValueRange(std::optional<ConstantIntRanges> value = std::nullopt)
+      : value(std::move(value)) {}
+
+  /// Whether the range is uninitialized. This happens when the state hasn't
+  /// been set during the analysis.
+  bool isUninitialized() const { return !value.has_value(); }
+
+  /// Get the known integer value range.
+  const ConstantIntRanges &getValue() const {
+    assert(!isUninitialized());
+    return *value;
+  }
+
+  /// Compare two ranges.
+  bool operator==(const IntegerValueRange &rhs) const {
+    return value == rhs.value;
+  }
+
+  /// Compute the least upper bound of two ranges.
+  static IntegerValueRange join(const IntegerValueRange &lhs,
+                                const IntegerValueRange &rhs) {
+    if (lhs.isUninitialized())
+      return rhs;
+    if (rhs.isUninitialized())
+      return lhs;
+    return IntegerValueRange{lhs.getValue().rangeUnion(rhs.getValue())};
+  }
+
+  /// Print the integer value range.
+  void print(raw_ostream &os) const { os << value; }
+
+private:
+  /// The known integer value range.
+  std::optional<ConstantIntRanges> value;
+};
+
+raw_ostream &operator<<(raw_ostream &, const IntegerValueRange &);
+
 /// The type of the `setResultRanges` callback provided to ops implementing
 /// InferIntRangeInterface. It should be called once for each integer result
 /// value and be passed the ConstantIntRanges corresponding to that value.
-using SetIntRangeFn = function_ref<void(Value, const ConstantIntRanges &)>;
+using SetIntRangeFn =
+    llvm::function_ref<void(Value, const ConstantIntRanges &)>;
+
+/// Similar to SetIntRangeFn, but operating on IntegerValueRange lattice values.
+/// This is the `setResultRanges` callback for the IntegerValueRange based
+/// interface method.
+using SetIntLatticeFn =
+    llvm::function_ref<void(Value, const IntegerValueRange &)>;
+
+class InferIntRangeInterface;
+
+namespace intrange::detail {
+/// Default implementation of `inferResultRangesFromOptional` which dispatches
+/// to the `inferResultRanges`.
+void defaultInferResultRanges(InferIntRangeInterface interface,
+                              ArrayRef<IntegerValueRange> argRanges,
+                              SetIntLatticeFn setResultRanges);
+
+/// Default implementation of `inferResultRanges` which dispatches to the
+/// `inferResultRangesFromOptional`.
+void defaultInferResultRangesFromOptional(InferIntRangeInterface interface,
+                                          ArrayRef<ConstantIntRanges> argRanges,
+                                          SetIntRangeFn setResultRanges);
+} // end namespace intrange::detail
 } // end namespace mlir
 
 #include "mlir/Interfaces/InferIntRangeInterface.h.inc"
diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.td b/mlir/include/mlir/Interfaces/InferIntRangeInterface.td
index dbdc526c6f10b..6ee436ce4d6c2 100644
--- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.td
+++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.td
@@ -28,9 +28,10 @@ def InferIntRangeInterface : OpInterface<"InferIntRangeInterface"> {
       Infer the bounds on the results of this op given the bounds on its arguments.
       For each result value or block argument (that isn't a branch argument,
       since the dataflow analysis handles those case), the method should call
-      `setValueRange` with that `Value` as an argument. When `setValueRange`
-      is not called for some value, it will recieve a default value of the mimimum
-      and maximum values for its type (the unbounded range).
+      `setValueRange` with that `Value` as an argument. When implemented,
+      `setValueRange` should be called on all result values for the operation.
+      When operations take non-integer inputs, the
+      `inferResultRangesFromOptional` method should be implemented instead.
 
       When called on an op that also implements the RegionBranchOpInterface
       or BranchOpInterface, this method should not attempt to infer the values
@@ -39,14 +40,39 @@ def InferIntRangeInterface : OpInterface<"InferIntRangeInterface"> {
 
       This function will only be called when at least one result of the op is a
       scalar integer value or the op has a region.
+    }],
+    /*retTy=*/"void",
+    /*methodName=*/"inferResultRanges",
+    /*args=*/(ins "::llvm::ArrayRef<::mlir::ConstantIntRanges>":$argRanges,
+                  "::mlir::SetIntRangeFn":$setResultRanges),
+    /*methodBody=*/"",
+    /*defaultImplementation=*/[{
+      ::mlir::intrange::detail::defaultInferResultRangesFromOptional($_op,
+                                                                     argRanges,
+                                                                     setResultRanges);
+    }]>,
+
+    InterfaceMethod<[{
+      Infer the bounds on the results of this op given the lattice representation
+      of the bounds for its arguments. For each result value or block argument
+      (that isn't a branch argument, since the dataflow analysis handles
+      those cases), the method should call `setValueRange` with that `Value`
+      as an argument. When implemented, `setValueRange` should be called on
+      all result values for the operation.
 
-      `argRanges` contains one `IntRangeAttrs` for each argument to the op in ODS
-       order. Non-integer arguments will have the an unbounded range of width-0
-       APInts in their `argRanges` element.
+      This method allows for more precise implementations when operations
+      want to reason about inputs which may be undefined during the analysis.
     }],
-    "void", "inferResultRanges", (ins
-      "::llvm::ArrayRef<::mlir::ConstantIntRanges>":$argRanges,
-      "::mlir::SetIntRangeFn":$setResultRanges)
-  >];
+    /*retTy=*/"void",
+    /*methodName=*/"inferResultRangesFromOptional",
+    /*args=*/(ins "::llvm::ArrayRef<::mlir::IntegerValueRange>":$argRanges,
+                  "::mlir::SetIntLatticeFn":$setResultRanges),
+    /*methodBody=*/"",
+    /*defaultImplementation=*/[{
+      ::mlir::intrange::detail::defaultInferResultRanges($_op,
+                                                         argRanges,
+                                                         setResultRanges);
+    }]>
+  ];
 }
 #endif // MLIR_INTERFACES_INFERINTRANGEINTERFACE
diff --git a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
index 851bb534bc7ee..3988a8826498a 100644
--- a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
+++ b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
@@ -25,7 +25,11 @@ namespace intrange {
 /// abstracted away here to permit writing the function that handles both
 /// 64- and 32-bit index types.
 using InferRangeFn =
-    function_ref<ConstantIntRanges(ArrayRef<ConstantIntRanges>)>;
+    std::function<ConstantIntRanges(ArrayRef<ConstantIntRanges>)>;
+
+/// Function that performs inference on an array of `IntegerValueRange`.
+using InferIntegerValueRangeFn =
+    std::function<IntegerValueRange(ArrayRef<IntegerValueRange>)>;
 
 static constexpr unsigned indexMinWidth = 32;
 static constexpr unsigned indexMaxWidth = 64;
@@ -52,7 +56,7 @@ using InferRangeWithOvfFlagsFn =
 ///
 /// The `mode` argument specifies if the unsigned, signed, or both results of
 /// the inference computation should be used when comparing the results.
-ConstantIntRanges inferIndexOp(InferRangeFn inferFn,
+ConstantIntRanges inferIndexOp(const InferRangeFn &inferFn,
                                ArrayRef<ConstantIntRanges> argRanges,
                                CmpMode mode);
 
diff --git a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp
index a82c30717e275..9721620807a0f 100644
--- a/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/IntegerRangeAnalysis.cpp
@@ -36,17 +36,6 @@
 using namespace mlir;
 using namespace mlir::dataflow;
 
-IntegerValueRange IntegerValueRange::getMaxRange(Value value) {
-  unsigned width = ConstantIntRanges::getStorageBitwidth(value.getType());
-  if (width == 0)
-    return {};
-  APInt umin = APInt::getMinValue(width);
-  APInt umax = APInt::getMaxValue(width);
-  APInt smin = width != 0 ? APInt::getSignedMinValue(width) : umin;
-  APInt smax = width != 0 ? APInt::getSignedMaxValue(width) : umax;
-  return IntegerValueRange{ConstantIntRanges{umin, umax, smin, smax}};
-}
-
 void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const {
   Lattice::onUpdate(solver);
 
@@ -72,24 +61,17 @@ void IntegerValueRangeLattice::onUpdate(DataFlowSolver *solver) const {
 void IntegerRangeAnalysis::visitOperation(
     Operation *op, ArrayRef<const IntegerValueRangeLattice *> operands,
     ArrayRef<IntegerValueRangeLattice *> results) {
-  // If the lattice on any operand is unitialized, bail out.
-  if (llvm::any_of(operands, [](const IntegerValueRangeLattice *lattice) {
-        return lattice->getValue().isUninitialized();
-      })) {
-    return;
-  }
-
   auto inferrable = dyn_cast<InferIntRangeInterface>(op);
   if (!inferrable)
     return setAllToEntryStates(results);
 
   LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n");
-  SmallVector<ConstantIntRanges> argRanges(
-      llvm::map_range(operands, [](const IntegerValueRangeLattice *val) {
-        return val->getValue().getValue();
-      }));
+  auto argRanges = llvm::map_to_vector(
+      operands, [](const IntegerValueRangeLattice *lattice) {
+        return lattice->getValue();
+      });
 
-  auto joinCallback = [&](Value v, const ConstantIntRanges &attrs) {
+  auto joinCallback = [&](Value v, const IntegerValueRange &attrs) {
     auto result = dyn_cast<OpResult>(v);
     if (!result)
       return;
@@ -99,7 +81,7 @@ void IntegerRangeAnalysis::visitOperation(
     IntegerValueRangeLattice *lattice = results[result.getResultNumber()];
     IntegerValueRange oldRange = lattice->getValue();
 
-    ChangeResult changed = lattice->join(IntegerValueRange{attrs});
+    ChangeResult changed = lattice->join(attrs);
 
     // Catch loop results with loop variant bounds and conservatively make
     // them [-inf, inf] so we don't circle around infinitely often (because
@@ -116,7 +98,7 @@ void IntegerRangeAnalysis::visitOperation(
     propagateIfChanged(lattice, changed);
   };
 
-  inferrable.inferResultRanges(argRanges, joinCallback);
+  inferrable.inferResultRangesFromOptional(argRanges, joinCallback);
 }
 
 void IntegerRangeAnalysis::visitNonControlFlowArguments(
@@ -124,17 +106,12 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments(
     ArrayRef<IntegerValueRangeLattice *> argLattices, unsigned firstIndex) {
   if (auto inferrable = dyn_cast<InferIntRangeInterface>(op)) {
     LLVM_DEBUG(llvm::dbgs() << "Inferring ranges for " << *op << "\n");
-    // If the lattice on any operand is unitialized, bail out.
-    if (llvm::any_of(op->getOperands(), [&](Value value) {
-          return getLatticeElementFor(op, value)->getValue().isUninitialized();
-        }))
-      return;
-    SmallVector<ConstantIntRanges> argRanges(
-        llvm::map_range(op->getOperands(), [&](Value value) {
-          return getLatticeElementFor(op, value)->getValue().getValue();
-        }));
 
-    auto joinCallback = [&](Value v, const ConstantIntRanges &attrs) {
+    auto argRanges = llvm::map_to_vector(op->getOperands(), [&](Value value) {
+      return getLatticeElementFor(op, value)->getValue();
+    });
+
+    auto joinCallback = [&](Value v, const IntegerValueRange &attrs) {
       auto arg = dyn_cast<BlockArgument>(v);
       if (!arg)
         return;
@@ -145,7 +122,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments(
       IntegerValueRangeLattice *lattice = argLattices[arg.getArgNumber()];
       IntegerValueRange oldRange = lattice->getValue();
 
-      ChangeResult changed = lattice->join(IntegerValueRange{attrs});
+      ChangeResult changed = lattice->join(attrs);
 
       // Catch loop results with loop variant bounds and conservatively make
       // them [-inf, inf] so we don't circle around infinitely often (because
@@ -162,7 +139,7 @@ void IntegerRangeAnalysis::visitNonControlFlowArguments(
       propagateIfChanged(lattice, changed);
     };
 
-    inferrable.inferResultRanges(argRanges, joinCallback);
+    inferrable.inferResultRangesFromOptional(argRanges, joinCallback);
     return;
   }
 
diff --git a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
index 0be3d76f556de..388794ec122d2 100644
--- a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
+++ b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
@@ -394,7 +394,9 @@ void mlir::populateArithToEmitCPatterns(TypeConverter &typeConverter,
     ArithConstantOpConversionPattern,
     ArithOpConversion<arith::AddFOp, emitc::AddOp>,
     ArithOpConversion<arith::DivFOp, emitc::DivOp>,
+    ArithOpConversion<arith::DivSIOp, emitc::DivOp>,
     ArithOpConversion<arith::MulFOp, emitc::MulOp>,
+    ArithOpConversion<arith::RemSIOp, emitc::RemOp>,
     ArithOpConversion<arith::SubFOp, emitc::SubOp>,
     IntegerOpConversion<arith::AddIOp, emitc::AddOp>,
     IntegerOpConversion<arith::MulIOp, emitc::MulOp>,
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index f425b1f59d994..70dcccf0a7307 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -77,9 +77,9 @@ Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
 }
 static constexpr StringLiteral amdgcnDataLayout =
     "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
-    "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
-    "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
-    "G1-ni:7:8";
+    "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:"
+    "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
+    "64-S32-A5-G1-ni:7:8:9";
 
 namespace {
 struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
index 89f956a5e7017..c0c015ab34aab 100644
--- a/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
+++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensor.cpp
@@ -224,8 +224,17 @@ class ReshapeConverter : public OpConversionPattern<tosa::ReshapeOp> {
   matchAndRewrite(tosa::ReshapeOp reshape, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const final {
     auto loc = reshape.getLoc();
-    auto resultType = reshape.getResult().getType();
-    auto input = reshape.getInput1();
+    auto resultType = cast_if_present<ShapedType>(
+        getTypeConverter()->convertType(reshape.getType()));
+    if (!resultType) {
+      return rewriter.notifyMatchFailure(reshape.getLoc(),
+                                         "could not convert result type");
+    }
+    auto input = dyn_cast<TypedValue<TensorType>>(adaptor.getInput1());
+    if (!input) {
+      return rewriter.notifyMatchFailure(reshape.getLoc(),
+                                         "expected input type to be tensor");
+    }
     auto newShape = reshape.getNewShape();
 
     // Infer all intermediate types
@@ -288,12 +297,13 @@ class SliceConverter : public OpConversionPattern<tosa::SliceOp> {
   }
 };
 
-class PadConverter : public OpRewritePattern<tosa::PadOp> {
+class PadConverter : public OpConversionPattern<tosa::PadOp> {
 public:
-  using OpRewritePattern<tosa::PadOp>::OpRewritePattern;
+  using OpConversionPattern::OpConversionPattern;
 
-  LogicalResult matchAndRewrite(tosa::PadOp padOp,
-                                PatternRewriter &rewriter) const final {
+  LogicalResult
+  matchAndRewrite(tosa::PadOp padOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const final {
     auto loc = padOp.getLoc();
     auto input = padOp.getInput1();
     auto padding = padOp.getPadding();
@@ -428,11 +438,8 @@ struct ConcatConverter : public OpConversionPattern<tosa::ConcatOp> {
 } // namespace
 
 void mlir::tosa::populateTosaToTensorConversionPatterns(
-    RewritePatternSet *patterns) {
-  patterns->add<
-    ConcatConverter,
-    PadConverter,
-    ReshapeConverter,
-    SliceConverter
-  >(patterns->getContext());
+    TypeConverter &converter, RewritePatternSet *patterns) {
+  patterns
+      ->add<ConcatConverter, PadConverter, ReshapeConverter, SliceConverter>(
+          converter, patterns->getContext());
 }
diff --git a/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp b/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp
index 50dc55667fb94..fa1c2cf7fba98 100644
--- a/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp
+++ b/mlir/lib/Conversion/TosaToTensor/TosaToTensorPass.cpp
@@ -42,7 +42,10 @@ struct TosaToTensor : public impl::TosaToTensorBase<TosaToTensor> {
     target.addLegalDialect<arith::ArithDialect>();
     target.addLegalDialect<tensor::TensorDialect>();
 
-    mlir::tosa::populateTosaToTensorConversionPatterns(&patterns);
+    TypeConverter converter;
+    mlir::tosa::populateTosaTypeConversion(converter);
+
+    mlir::tosa::populateTosaToTensorConversionPatterns(converter, &patterns);
 
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns))))
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index a0b50251c6b67..5797c5681a5fd 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -2467,6 +2467,12 @@ TypedAttr mlir::arith::getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
                            : APFloat::getInf(semantic, /*Negative=*/true);
     return builder.getFloatAttr(resultType, identity);
   }
+  case AtomicRMWKind::maxnumf: {
+    const llvm::fltSemantics &semantic =
+        llvm::cast<FloatType>(resultType).getFloatSemantics();
+    APFloat identity = APFloat::getNaN(semantic, /*Negative=*/true);
+    return builder.getFloatAttr(resultType, identity);
+  }
   case AtomicRMWKind::addf:
   case AtomicRMWKind::addi:
   case AtomicRMWKind::maxu:
@@ -2489,6 +2495,12 @@ TypedAttr mlir::arith::getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
 
     return builder.getFloatAttr(resultType, identity);
   }
+  case AtomicRMWKind::minnumf: {
+    const llvm::fltSemantics &semantic =
+        llvm::cast<FloatType>(resultType).getFloatSemantics();
+    APFloat identity = APFloat::getNaN(semantic, /*Negative=*/false);
+    return builder.getFloatAttr(resultType, identity);
+  }
   case AtomicRMWKind::mins:
     return builder.getIntegerAttr(
         resultType, APInt::getSignedMaxValue(
@@ -2518,6 +2530,8 @@ std::optional<TypedAttr> mlir::arith::getNeutralElement(Operation *op) {
           .Case([](arith::MulFOp op) { return AtomicRMWKind::mulf; })
           .Case([](arith::MaximumFOp op) { return AtomicRMWKind::maximumf; })
           .Case([](arith::MinimumFOp op) { return AtomicRMWKind::minimumf; })
+          .Case([](arith::MaxNumFOp op) { return AtomicRMWKind::maxnumf; })
+          .Case([](arith::MinNumFOp op) { return AtomicRMWKind::minnumf; })
           // Integer operations.
           .Case([](arith::AddIOp op) { return AtomicRMWKind::addi; })
           .Case([](arith::OrIOp op) { return AtomicRMWKind::ori; })
diff --git a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
index fbe2ecab8adca..462044417b5fb 100644
--- a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
@@ -295,18 +295,24 @@ void arith::CmpIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
 // SelectOp
 //===----------------------------------------------------------------------===//
 
-void arith::SelectOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
-                                        SetIntRangeFn setResultRange) {
-  std::optional<APInt> mbCondVal = argRanges[0].getConstantValue();
+void arith::SelectOp::inferResultRangesFromOptional(
+    ArrayRef<IntegerValueRange> argRanges, SetIntLatticeFn setResultRange) {
+  std::optional<APInt> mbCondVal =
+      argRanges[0].isUninitialized()
+          ? std::nullopt
+          : argRanges[0].getValue().getConstantValue();
+
+  const IntegerValueRange &trueCase = argRanges[1];
+  const IntegerValueRange &falseCase = argRanges[2];
 
   if (mbCondVal) {
     if (mbCondVal->isZero())
-      setResultRange(getResult(), argRanges[2]);
+      setResultRange(getResult(), falseCase);
     else
-      setResultRange(getResult(), argRanges[1]);
+      setResultRange(getResult(), trueCase);
     return;
   }
-  setResultRange(getResult(), argRanges[1].rangeUnion(argRanges[2]));
+  setResultRange(getResult(), IntegerValueRange::join(trueCase, falseCase));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp
deleted file mode 100644
index 9a066756f429c..0000000000000
--- a/mlir/lib/Dialect/Arith/Transforms/Bufferize.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-//===- Bufferize.cpp - Bufferization for Arith ops ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Arith/Transforms/Passes.h"
-
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.h"
-#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
-#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-
-namespace mlir {
-namespace arith {
-#define GEN_PASS_DEF_ARITHBUFFERIZEPASS
-#include "mlir/Dialect/Arith/Transforms/Passes.h.inc"
-} // namespace arith
-} // namespace mlir
-
-using namespace mlir;
-using namespace bufferization;
-
-namespace {
-/// Pass to bufferize Arith ops.
-struct ArithBufferizePass
-    : public arith::impl::ArithBufferizePassBase<ArithBufferizePass> {
-  using ArithBufferizePassBase::ArithBufferizePassBase;
-
-  ArithBufferizePass(uint64_t alignment = 0, bool constantOpOnly = false)
-      : constantOpOnly(constantOpOnly) {
-    this->alignment = alignment;
-  }
-
-  void runOnOperation() override {
-    BufferizationOptions options = getPartialBufferizationOptions();
-    if (constantOpOnly) {
-      options.opFilter.allowOperation<arith::ConstantOp>();
-    } else {
-      options.opFilter.allowDialect<arith::ArithDialect>();
-    }
-    options.bufferAlignment = alignment;
-
-    if (failed(bufferizeOp(getOperation(), options)))
-      signalPassFailure();
-  }
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<bufferization::BufferizationDialect, memref::MemRefDialect,
-                    arith::ArithDialect>();
-    arith::registerBufferizableOpInterfaceExternalModels(registry);
-  }
-
-private:
-  bool constantOpOnly;
-};
-} // namespace
-
-std::unique_ptr<Pass>
-mlir::arith::createConstantBufferizePass(uint64_t alignment) {
-  return std::make_unique<ArithBufferizePass>(alignment,
-                                              /*constantOpOnly=*/true);
-}
diff --git a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
index 12659eaba1fa5..6b8bde8dc2aaf 100644
--- a/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Arith/Transforms/CMakeLists.txt
@@ -1,7 +1,6 @@
 add_mlir_dialect_library(MLIRArithTransforms
   BufferDeallocationOpInterfaceImpl.cpp
   BufferizableOpInterfaceImpl.cpp
-  Bufferize.cpp
   BufferViewFlowOpInterfaceImpl.cpp
   EmulateUnsupportedFloats.cpp
   EmulateWideInt.cpp
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index 7ba347a1f15e4..0fddd60eb8140 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -320,29 +320,6 @@ struct OneShotBufferizePass
 };
 } // namespace
 
-namespace {
-struct BufferizationBufferizePass
-    : public bufferization::impl::BufferizationBufferizeBase<
-          BufferizationBufferizePass> {
-  void runOnOperation() override {
-    BufferizationOptions options = getPartialBufferizationOptions();
-    options.opFilter.allowDialect<BufferizationDialect>();
-
-    if (failed(bufferizeOp(getOperation(), options)))
-      signalPassFailure();
-  }
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry
-        .insert<bufferization::BufferizationDialect, memref::MemRefDialect>();
-  }
-};
-} // namespace
-
-std::unique_ptr<Pass> mlir::bufferization::createBufferizationBufferizePass() {
-  return std::make_unique<BufferizationBufferizePass>();
-}
-
 std::unique_ptr<Pass> mlir::bufferization::createOneShotBufferizePass() {
   return std::make_unique<OneShotBufferizePass>();
 }
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index dcf3f3b52a606..60b911948d4a0 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -2555,6 +2555,24 @@ Region *LLVMFuncOp::getCallableRegion() {
   return &getBody();
 }
 
+//===----------------------------------------------------------------------===//
+// UndefOp.
+//===----------------------------------------------------------------------===//
+
+/// Fold an undef operation to a dedicated undef attribute.
+OpFoldResult LLVM::UndefOp::fold(FoldAdaptor) {
+  return LLVM::UndefAttr::get(getContext());
+}
+
+//===----------------------------------------------------------------------===//
+// PoisonOp.
+//===----------------------------------------------------------------------===//
+
+/// Fold a poison operation to a dedicated poison attribute.
+OpFoldResult LLVM::PoisonOp::fold(FoldAdaptor) {
+  return LLVM::PoisonAttr::get(getContext());
+}
+
 //===----------------------------------------------------------------------===//
 // ZeroOp.
 //===----------------------------------------------------------------------===//
@@ -2568,6 +2586,15 @@ LogicalResult LLVM::ZeroOp::verify() {
   return success();
 }
 
+/// Fold a zero operation to a builtin zero attribute when possible and fall
+/// back to a dedicated zero attribute.
+OpFoldResult LLVM::ZeroOp::fold(FoldAdaptor) {
+  OpFoldResult result = Builder(getContext()).getZeroAttr(getType());
+  if (result)
+    return result;
+  return LLVM::ZeroAttr::get(getContext());
+}
+
 //===----------------------------------------------------------------------===//
 // ConstantOp.
 //===----------------------------------------------------------------------===//
@@ -3271,11 +3298,18 @@ LogicalResult LLVMDialect::verifyRegionResultAttribute(Operation *op,
 
 Operation *LLVMDialect::materializeConstant(OpBuilder &builder, Attribute value,
                                             Type type, Location loc) {
-  // If this was folded from an llvm.mlir.addressof operation, it should be
-  // materialized as such.
+  // If this was folded from an operation other than llvm.mlir.constant, it
+  // should be materialized as such. Note that an llvm.mlir.zero may fold into
+  // a builtin zero attribute and thus will materialize as a llvm.mlir.constant.
   if (auto symbol = dyn_cast<FlatSymbolRefAttr>(value))
     if (isa<LLVM::LLVMPointerType>(type))
       return builder.create<LLVM::AddressOfOp>(loc, type, symbol);
+  if (isa<LLVM::UndefAttr>(value))
+    return builder.create<LLVM::UndefOp>(loc, type);
+  if (isa<LLVM::PoisonAttr>(value))
+    return builder.create<LLVM::PoisonOp>(loc, type);
+  if (isa<LLVM::ZeroAttr>(value))
+    return builder.create<LLVM::ZeroOp>(loc, type);
   // Otherwise try materializing it as a regular llvm.mlir.constant op.
   return LLVM::ConstantOp::materialize(builder, value, type, loc);
 }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp
deleted file mode 100644
index 8812ca14ba610..0000000000000
--- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===- Bufferize.cpp - Bufferization of linalg ops ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Linalg/Passes.h"
-
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
-#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h"
-#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/IR/BuiltinDialect.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/Pass/Pass.h"
-
-namespace mlir {
-#define GEN_PASS_DEF_LINALGBUFFERIZEPASS
-#include "mlir/Dialect/Linalg/Passes.h.inc"
-} // namespace mlir
-
-using namespace mlir;
-using namespace bufferization;
-
-namespace {
-/// Converts Linalg operations that work on tensor-type operands or results to
-/// work on buffers.
-struct LinalgBufferizePass
-    : public impl::LinalgBufferizePassBase<LinalgBufferizePass> {
-  using impl::LinalgBufferizePassBase<
-      LinalgBufferizePass>::LinalgBufferizePassBase;
-  void runOnOperation() override {
-    BufferizationOptions options = getPartialBufferizationOptions();
-    options.opFilter.allowDialect<linalg::LinalgDialect>();
-
-    if (failed(bufferizeOp(getOperation(), options)))
-      signalPassFailure();
-  }
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<bufferization::BufferizationDialect, memref::MemRefDialect,
-                    tensor::TensorDialect, linalg::LinalgDialect>();
-    linalg::registerBufferizableOpInterfaceExternalModels(registry);
-  }
-};
-} // namespace
diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
index ed9f40089282a..7e3dc56e0acdc 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
@@ -2,7 +2,6 @@ add_mlir_dialect_library(MLIRLinalgTransforms
   AllInterfaces.cpp
   BubbleUpExtractSlice.cpp
   BufferizableOpInterfaceImpl.cpp
-  Bufferize.cpp
   ConstantFold.cpp
   ConvertToDestinationStyle.cpp
   ConvertConv2DToImg2Col.cpp
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp
index 8fffabf11f3fd..2e6079e1402e1 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp
@@ -23,21 +23,21 @@ using namespace mlir;
 using namespace mlir::linalg;
 
 namespace {
-/// Base class for constant folding linalg.generic ops with N inputs, 1 output,
-/// and permutation indexing maps.
+/// Base class for constant folding linalg structured ops with N inputs, 1
+/// output, and permutation indexing maps.
 ///
 /// `ConcreteType` should provide methods with signatures
 ///
 /// ```c++
-///   bool matchIndexingMaps(GenericOp genericOp) const;
-///   RegionComputationFn getRegionComputeFn(GenericOp) const;
+///   bool matchIndexingMaps(LinalgOp linalgOp) const;
+///   RegionComputationFn getRegionComputeFn(LinalgOp) const;
 /// ```
 ///
 /// The latter inspects the region and returns the computation inside as a
 /// functor. The functor will be invoked with constant elements for all inputs
 /// and should return the corresponding computed constant element for output.
 template <typename ConcreteType>
-class FoldConstantBase : public OpRewritePattern<GenericOp> {
+class FoldConstantBase : public OpInterfaceRewritePattern<LinalgOp> {
 public:
   struct APIntOrFloat {
     std::optional<APInt> apInt;
@@ -52,25 +52,26 @@ class FoldConstantBase : public OpRewritePattern<GenericOp> {
 
   FoldConstantBase(MLIRContext *context, const ControlFusionFn &controlFn,
                    PatternBenefit benefit = 1)
-      : OpRewritePattern<GenericOp>(context, benefit), controlFn(controlFn) {}
+      : OpInterfaceRewritePattern<LinalgOp>(context, benefit),
+        controlFn(controlFn) {}
 
-  LogicalResult matchAndRewrite(GenericOp genericOp,
+  LogicalResult matchAndRewrite(LinalgOp linalgOp,
                                 PatternRewriter &rewriter) const override {
     // Mixed and buffer sematics aren't supported.
-    if (!genericOp.hasPureTensorSemantics())
+    if (!linalgOp.hasPureTensorSemantics())
       return failure();
 
     // Only support ops generating one output for now.
-    if (genericOp.getNumDpsInits() != 1)
+    if (linalgOp.getNumDpsInits() != 1)
       return failure();
 
-    auto outputType = dyn_cast<ShapedType>(genericOp.getResultTypes().front());
+    auto outputType = dyn_cast<ShapedType>(linalgOp->getResultTypes().front());
     // Require the output types to be static given that we are generating
     // constants.
     if (!outputType || !outputType.hasStaticShape())
       return failure();
 
-    if (!llvm::all_of(genericOp.getInputs(), [](Value input) {
+    if (!llvm::all_of(linalgOp.getDpsInputs(), [](Value input) {
           return isa<ShapedType>(input.getType());
         }))
       return failure();
@@ -80,7 +81,7 @@ class FoldConstantBase : public OpRewritePattern<GenericOp> {
       return cast<ShapedType>(value.getType()).getElementType();
     };
     if (!llvm::all_equal(
-            llvm::map_range(genericOp->getOperands(), getOperandElementType)))
+            llvm::map_range(linalgOp->getOperands(), getOperandElementType)))
       return failure();
 
     // We can only handle the case where we have int/float elements.
@@ -93,30 +94,30 @@ class FoldConstantBase : public OpRewritePattern<GenericOp> {
     // entirely in the compiler, without needing to turn all indices into
     // Values, and then do affine apply on them, and then match back the
     // constant again.
-    if (!llvm::all_of(genericOp.getIndexingMapsArray(),
+    if (!llvm::all_of(linalgOp.getIndexingMapsArray(),
                       [](AffineMap map) { return map.isPermutation(); }))
       return failure();
 
-    for (OpOperand &operand : genericOp.getDpsInitsMutable()) {
-      if (genericOp.payloadUsesValueFromOperand(&operand))
+    for (OpOperand &operand : linalgOp.getDpsInitsMutable()) {
+      if (linalgOp.payloadUsesValueFromOperand(&operand))
         return failure();
     }
 
     // Further check the indexing maps are okay for the ConcreteType.
-    if (!static_cast<const ConcreteType *>(this)->matchIndexingMaps(genericOp))
+    if (!static_cast<const ConcreteType *>(this)->matchIndexingMaps(linalgOp))
       return failure();
 
     // Defer to the concrete type to check the region and discover the
     // computation inside.
     RegionComputationFn computeFn =
-        static_cast<const ConcreteType *>(this)->getRegionComputeFn(genericOp);
+        static_cast<const ConcreteType *>(this)->getRegionComputeFn(linalgOp);
     if (!computeFn)
       return failure();
 
     // All inputs should be constants.
-    int numInputs = genericOp.getNumDpsInputs();
+    int numInputs = linalgOp.getNumDpsInputs();
     SmallVector<DenseIntOrFPElementsAttr> inputValues(numInputs);
-    for (const auto &en : llvm::enumerate(genericOp.getDpsInputOperands())) {
+    for (const auto &en : llvm::enumerate(linalgOp.getDpsInputOperands())) {
       if (!matchPattern(en.value()->get(),
                         m_Constant(&inputValues[en.index()])))
         return failure();
@@ -124,12 +125,11 @@ class FoldConstantBase : public OpRewritePattern<GenericOp> {
 
     // Identified this as a potential candidate for folding. Now check the
     // policy to see whether we are allowed to proceed.
-    for (OpOperand *operand : genericOp.getDpsInputOperands()) {
+    for (OpOperand *operand : linalgOp.getDpsInputOperands()) {
       if (!controlFn(operand))
         return failure();
     }
 
-    auto linalgOp = cast<LinalgOp>(genericOp.getOperation());
     SmallVector<int64_t, 4> loopBounds = linalgOp.computeStaticLoopSizes();
     int64_t numElements = outputType.getNumElements();
 
@@ -155,8 +155,8 @@ class FoldConstantBase : public OpRewritePattern<GenericOp> {
 
     SmallVector<SmallVector<unsigned>> inputDims;
     for (int i = 0; i < numInputs; ++i)
-      inputDims.push_back(getDimPositions(genericOp.getIndexingMapsArray()[i]));
-    auto outputDims = getDimPositions(genericOp.getIndexingMapsArray().back());
+      inputDims.push_back(getDimPositions(linalgOp.getIndexingMapsArray()[i]));
+    auto outputDims = getDimPositions(linalgOp.getIndexingMapsArray().back());
     auto outputShape = outputType.getShape();
 
     // Allocate small vectors for index delinearization. Initial values do not
@@ -173,7 +173,7 @@ class FoldConstantBase : public OpRewritePattern<GenericOp> {
     APIntOrFloatArray computeFnInputs;
 
     auto inputShapes = llvm::to_vector<4>(
-        llvm::map_range(genericOp.getInputs(), [](Value value) {
+        llvm::map_range(linalgOp.getDpsInputs(), [](Value value) {
           return cast<ShapedType>(value.getType()).getShape();
         }));
 
@@ -254,7 +254,7 @@ class FoldConstantBase : public OpRewritePattern<GenericOp> {
         isFloat ? DenseElementsAttr::get(outputType, fpOutputValues)
                 : DenseElementsAttr::get(outputType, intOutputValues);
 
-    rewriter.replaceOpWithNewOp<arith::ConstantOp>(genericOp, outputAttr);
+    rewriter.replaceOpWithNewOp<arith::ConstantOp>(linalgOp, outputAttr);
     return success();
   }
 
@@ -262,18 +262,20 @@ class FoldConstantBase : public OpRewritePattern<GenericOp> {
   ControlFusionFn controlFn;
 };
 
-// Folds linalg.generic ops that are actually transposes on constant values.
+// Folds linalg.transpose (and linalg.generic ops that are actually transposes)
+// on constant values.
 struct FoldConstantTranspose : public FoldConstantBase<FoldConstantTranspose> {
+
   using FoldConstantBase::FoldConstantBase;
 
-  bool matchIndexingMaps(GenericOp genericOp) const {
+  bool matchIndexingMaps(LinalgOp linalgOp) const {
     // We should have one input and one output.
-    return genericOp.getIndexingMapsArray().size() == 2;
+    return linalgOp.getIndexingMapsArray().size() == 2;
   }
 
-  RegionComputationFn getRegionComputeFn(GenericOp genericOp) const {
+  RegionComputationFn getRegionComputeFn(LinalgOp linalgOp) const {
     // Make sure the region only contains a yield op.
-    Block &body = genericOp.getRegion().front();
+    Block &body = linalgOp->getRegion(0).front();
     if (!llvm::hasSingleElement(body))
       return nullptr;
     auto yieldOp = dyn_cast<linalg::YieldOp>(body.getTerminator());
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
index 65efa18af18f6..c0829397f1f85 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
@@ -351,7 +351,8 @@ static UnitExtentReplacementInfo dropUnitExtentFromOperandMetadata(
   auto isUnitDim = [&](unsigned dim) {
     if (auto dimExpr = dyn_cast<AffineDimExpr>(exprs[dim])) {
       unsigned oldPosition = dimExpr.getPosition();
-      return !oldDimsToNewDimsMap.count(oldPosition);
+      return !oldDimsToNewDimsMap.count(oldPosition) &&
+             (operandShape[dim] == 1);
     }
     // Handle the other case where the shape is 1, and is accessed using a
     // constant 0.
diff --git a/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp
deleted file mode 100644
index 9dadbdbc91eca..0000000000000
--- a/mlir/lib/Dialect/Shape/Transforms/Bufferize.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//====----- Bufferize.cpp - Bufferization of shape ops  ---------*- C++-*--===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Shape/Transforms/Passes.h"
-
-#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
-#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Shape/IR/Shape.h"
-#include "mlir/Dialect/Shape/Transforms/BufferizableOpInterfaceImpl.h"
-#include "mlir/Pass/Pass.h"
-
-namespace mlir {
-#define GEN_PASS_DEF_SHAPEBUFFERIZE
-#include "mlir/Dialect/Shape/Transforms/Passes.h.inc"
-} // namespace mlir
-
-using namespace mlir;
-using namespace bufferization;
-
-namespace {
-struct ShapeBufferizePass
-    : public impl::ShapeBufferizeBase<ShapeBufferizePass> {
-  void runOnOperation() override {
-    BufferizationOptions options = getPartialBufferizationOptions();
-    options.opFilter.allowDialect<shape::ShapeDialect>();
-
-    if (failed(bufferizeOp(getOperation(), options)))
-      signalPassFailure();
-  }
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<bufferization::BufferizationDialect, memref::MemRefDialect,
-                    shape::ShapeDialect>();
-    shape::registerBufferizableOpInterfaceExternalModels(registry);
-  }
-};
-} // namespace
-
-std::unique_ptr<OperationPass<func::FuncOp>> mlir::createShapeBufferizePass() {
-  return std::make_unique<ShapeBufferizePass>();
-}
diff --git a/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt
index 7c9b0d2e5e3a8..a51c6780c2866 100644
--- a/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Shape/Transforms/CMakeLists.txt
@@ -1,6 +1,5 @@
 add_mlir_dialect_library(MLIRShapeOpsTransforms
   BufferizableOpInterfaceImpl.cpp
-  Bufferize.cpp
   OutlineShapeComputation.cpp
   RemoveShapeConstraints.cpp
   ShapeToShapeLowering.cpp
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
index f57353b5892b5..b42d58634a36c 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -23,7 +23,6 @@
 
 namespace mlir {
 #define GEN_PASS_DEF_SPARSEASSEMBLER
-#define GEN_PASS_DEF_SPARSEENCODINGPROPAGATION
 #define GEN_PASS_DEF_SPARSEREINTERPRETMAP
 #define GEN_PASS_DEF_PRESPARSIFICATIONREWRITE
 #define GEN_PASS_DEF_SPARSIFICATIONPASS
@@ -61,14 +60,6 @@ struct SparseAssembler : public impl::SparseAssemblerBase<SparseAssembler> {
   }
 };
 
-struct SparseEncodingPropagation
-    : public impl::SparseEncodingPropagationBase<SparseEncodingPropagation> {
-  SparseEncodingPropagation() = default;
-  SparseEncodingPropagation(const SparseEncodingPropagation &pass) = default;
-
-  void runOnOperation() override {}
-};
-
 struct SparseReinterpretMap
     : public impl::SparseReinterpretMapBase<SparseReinterpretMap> {
   SparseReinterpretMap() = default;
@@ -407,10 +398,6 @@ std::unique_ptr<Pass> mlir::createSparseAssembler() {
   return std::make_unique<SparseAssembler>();
 }
 
-std::unique_ptr<Pass> mlir::createSparseEncodingPropagationPass() {
-  return std::make_unique<SparseEncodingPropagation>();
-}
-
 std::unique_ptr<Pass> mlir::createSparseReinterpretMapPass() {
   return std::make_unique<SparseReinterpretMap>();
 }
diff --git a/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp
deleted file mode 100644
index d27c4576a8b7a..0000000000000
--- a/mlir/lib/Dialect/Tensor/Transforms/Bufferize.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===- Bufferize.cpp - Bufferization for `tensor` dialect ops -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements bufferization of `tensor` dialect ops
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
-#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h"
-#include "mlir/Dialect/Tensor/Transforms/Passes.h"
-#include "mlir/IR/ImplicitLocOpBuilder.h"
-#include "mlir/Transforms/DialectConversion.h"
-
-namespace mlir {
-namespace tensor {
-#define GEN_PASS_DEF_TENSORBUFFERIZE
-#include "mlir/Dialect/Tensor/Transforms/Passes.h.inc"
-} // namespace tensor
-} // namespace mlir
-
-using namespace mlir;
-using namespace bufferization;
-
-namespace {
-struct TensorBufferizePass
-    : public tensor::impl::TensorBufferizeBase<TensorBufferizePass> {
-  void runOnOperation() override {
-    BufferizationOptions options = getPartialBufferizationOptions();
-    options.opFilter.allowDialect<tensor::TensorDialect>();
-
-    if (failed(bufferizeOp(getOperation(), options)))
-      signalPassFailure();
-  }
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry
-        .insert<bufferization::BufferizationDialect, memref::MemRefDialect,
-                tensor::TensorDialect, scf::SCFDialect, arith::ArithDialect>();
-    tensor::registerBufferizableOpInterfaceExternalModels(registry);
-  }
-};
-} // namespace
-
-std::unique_ptr<Pass> mlir::tensor::createTensorBufferizePass() {
-  return std::make_unique<TensorBufferizePass>();
-}
diff --git a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt
index 0aabdaf667b9d..ce32dea09bb0b 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt
@@ -1,6 +1,5 @@
 add_mlir_dialect_library(MLIRTensorTransforms
   BufferizableOpInterfaceImpl.cpp
-  Bufferize.cpp
   ConcatOpPatterns.cpp
   EmptyOpPatterns.cpp
   ExtractSliceFromReshapeUtils.cpp
diff --git a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt
index 0e6510ba1e925..c78a74b874aff 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Tosa/Transforms/CMakeLists.txt
@@ -7,6 +7,7 @@ add_mlir_dialect_library(MLIRTosaTransforms
   TosaLayerwiseConstantFoldPass.cpp
   TosaMakeBroadcastable.cpp
   TosaOptionalDecompositions.cpp
+  TosaTypeConverters.cpp
   TosaValidation.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp
new file mode 100644
index 0000000000000..d2650de8cd7f0
--- /dev/null
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaTypeConverters.cpp
@@ -0,0 +1,52 @@
+
+//===- TosaTypeConverters.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Type converters for lowering TOSA to linalg/arith.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Tosa/Transforms/Passes.h"
+
+#include "mlir/Transforms/DialectConversion.h"
+
+using namespace mlir;
+
+void mlir::tosa::populateTosaTypeConversion(TypeConverter &converter) {
+  // Unsigned integers become signless integers of the same bit width; every
+  // other type is returned unchanged.
+  converter.addConversion([](Type type) -> std::optional<Type> {
+    if (type.isUnsignedInteger()) {
+      return IntegerType::get(type.getContext(), type.getIntOrFloatBitWidth(),
+                              IntegerType::SignednessSemantics::Signless);
+    }
+    return type;
+  });
+  // Tensors are converted by converting their element type. The callback
+  // captures `converter` by reference to run that element conversion.
+  converter.addConversion([&](TensorType type) -> std::optional<Type> {
+    auto converted = converter.convertType(type.getElementType());
+    if (!converted)
+      return {};
+    return type.clone(converted);
+  });
+  // Source and target materializations are identical: a single input value
+  // is bridged to `resultType` with an unrealized conversion cast. Anything
+  // other than exactly one input is declined.
+  auto materializeCast = [](OpBuilder &builder, Type resultType,
+                            ValueRange inputs,
+                            Location loc) -> std::optional<Value> {
+    if (inputs.size() != 1)
+      return std::nullopt;
+
+    return builder.create<UnrealizedConversionCastOp>(loc, resultType, inputs)
+        .getResult(0);
+  };
+  converter.addSourceMaterialization(materializeCast);
+  converter.addTargetMaterialization(materializeCast);
+}
diff --git a/mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp
deleted file mode 100644
index ee99a99b56109..0000000000000
--- a/mlir/lib/Dialect/Vector/Transforms/Bufferize.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-//===- Bufferize.cpp - Bufferization for `vector` dialect ops -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements bufferization of `vector` dialect ops
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
-
-#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
-#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h"
-#include "mlir/Dialect/Vector/Transforms/Passes.h"
-
-namespace mlir {
-namespace vector {
-#define GEN_PASS_DEF_VECTORBUFFERIZE
-#include "mlir/Dialect/Vector/Transforms/Passes.h.inc"
-} // namespace vector
-} // namespace mlir
-
-using namespace mlir;
-using namespace bufferization;
-
-namespace {
-struct VectorBufferizePass
-    : public vector::impl::VectorBufferizeBase<VectorBufferizePass> {
-  void runOnOperation() override {
-    BufferizationOptions options = getPartialBufferizationOptions();
-    options.opFilter.allowDialect<vector::VectorDialect>();
-
-    if (failed(bufferizeOp(getOperation(), options)))
-      signalPassFailure();
-  }
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<bufferization::BufferizationDialect, memref::MemRefDialect,
-                    tensor::TensorDialect, vector::VectorDialect>();
-    vector::registerBufferizableOpInterfaceExternalModels(registry);
-  }
-};
-} // namespace
-
-std::unique_ptr<Pass> mlir::vector::createVectorBufferizePass() {
-  return std::make_unique<VectorBufferizePass>();
-}
diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
index c4b6abd3e2361..4dbefdd376a8b 100644
--- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
@@ -1,6 +1,5 @@
 add_mlir_dialect_library(MLIRVectorTransforms
   BufferizableOpInterfaceImpl.cpp
-  Bufferize.cpp
   LowerVectorBroadcast.cpp
   LowerVectorContract.cpp
   LowerVectorGather.cpp
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
index 802a64b0805ee..156bf742f6297 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
@@ -44,6 +44,19 @@ static bool isLessThanTargetBitWidth(Operation *op, unsigned targetBitWidth) {
   return true;
 }
 
+static bool isLessThanOrEqualTargetBitWidth(Type t, unsigned targetBitWidth) {
+  VectorType vecType = dyn_cast<VectorType>(t);
+  // Reject non-vectors, index element types (getElementTypeBitWidth will
+  // abort for them) and 0-D vectors (there is no dimension to fold).
+  if (!vecType || vecType.getElementType().isIndex() ||
+      vecType.getRank() == 0)
+    return false;
+  // Compute in 64 bits so a large trailing dimension cannot wrap `unsigned`.
+  uint64_t trailingVecDimBitWidth =
+      vecType.getShape().back() * vecType.getElementTypeBitWidth();
+  return trailingVecDimBitWidth <= targetBitWidth;
+}
+
 namespace {
 struct LinearizeConstant final : OpConversionPattern<arith::ConstantOp> {
   using OpConversionPattern::OpConversionPattern;
@@ -355,6 +368,88 @@ struct LinearizeVectorExtract final
     return success();
   }
 
+private:
+  unsigned targetVectorBitWidth;
+};
+
+/// This pattern converts the InsertOp to a ShuffleOp that works on a
+/// linearized vector.
+/// Following,
+///   vector.insert %source, %destination [ position ]
+/// is converted to :
+///   %source_1d = vector.shape_cast %source
+///   %destination_1d = vector.shape_cast %destination
+///   %out_1d = vector.shuffle %destination_1d, %source_1d [shuffle_indices_1d]
+///   %out_nd = vector.shape_cast %out_1d
+/// `shuffle_indices_1d` is computed using the position of the original insert.
+struct LinearizeVectorInsert final
+    : public OpConversionPattern<vector::InsertOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LinearizeVectorInsert(
+      const TypeConverter &typeConverter, MLIRContext *context,
+      unsigned targetVectBitWidth = std::numeric_limits<unsigned>::max(),
+      PatternBenefit benefit = 1)
+      : OpConversionPattern(typeConverter, context, benefit),
+        targetVectorBitWidth(targetVectBitWidth) {}
+  LogicalResult
+  matchAndRewrite(vector::InsertOp insertOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Type dstTy = getTypeConverter()->convertType(insertOp.getDestVectorType());
+    assert(!(insertOp.getDestVectorType().isScalable() ||
+             cast<VectorType>(dstTy).isScalable()) &&
+           "scalable vectors are not supported.");
+
+    if (!isLessThanOrEqualTargetBitWidth(insertOp.getSourceType(),
+                                         targetVectorBitWidth))
+      return rewriter.notifyMatchFailure(
+          insertOp, "Can't flatten since targetBitWidth < OpSize");
+
+    // dynamic position is not supported
+    if (insertOp.hasDynamicPosition())
+      return rewriter.notifyMatchFailure(insertOp,
+                                         "dynamic position is not supported.");
+    auto srcTy = insertOp.getSourceType();
+    auto srcAsVec = dyn_cast<VectorType>(srcTy);
+    uint64_t srcSize = 0;
+    if (srcAsVec) {
+      srcSize = srcAsVec.getNumElements();
+    } else {
+      return rewriter.notifyMatchFailure(insertOp,
+                                         "scalars are not supported.");
+    }
+
+    auto dstShape = insertOp.getDestVectorType().getShape();
+    const auto dstSize = insertOp.getDestVectorType().getNumElements();
+    auto dstSizeForOffsets = dstSize;
+
+    // compute linearized offset
+    int64_t linearizedOffset = 0;
+    auto offsetsNd = insertOp.getStaticPosition();
+    for (auto [dim, offset] : llvm::enumerate(offsetsNd)) {
+      dstSizeForOffsets /= dstShape[dim];
+      linearizedOffset += offset * dstSizeForOffsets;
+    }
+
+    llvm::SmallVector<int64_t, 2> indices(dstSize);
+    auto origValsUntil = indices.begin();
+    std::advance(origValsUntil, linearizedOffset);
+    std::iota(indices.begin(), origValsUntil,
+              0); // original values that remain [0, offset)
+    auto newValsUntil = origValsUntil;
+    std::advance(newValsUntil, srcSize);
+    std::iota(origValsUntil, newValsUntil,
+              dstSize); // new values [offset, offset+srcNumElements)
+    std::iota(newValsUntil, indices.end(),
+              linearizedOffset + srcSize); // the rest of original values
+                                           // [offset+srcNumElements, end)
+
+    rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
+        insertOp, dstTy, adaptor.getDest(), adaptor.getSource(),
+        rewriter.getI64ArrayAttr(indices));
+
+    return success();
+  }
+
 private:
   unsigned targetVectorBitWidth;
 };
@@ -410,6 +505,6 @@ void mlir::vector::populateVectorLinearizeShuffleLikeOpsPatterns(
                    : true;
       });
   patterns.add<LinearizeVectorShuffle, LinearizeVectorExtract,
-               LinearizeVectorExtractStridedSlice>(
+               LinearizeVectorInsert, LinearizeVectorExtractStridedSlice>(
       typeConverter, patterns.getContext(), targetBitWidth);
 }
diff --git a/mlir/lib/Interfaces/InferIntRangeInterface.cpp b/mlir/lib/Interfaces/InferIntRangeInterface.cpp
index b3f6c0ee3cc32..d879b93586899 100644
--- a/mlir/lib/Interfaces/InferIntRangeInterface.cpp
+++ b/mlir/lib/Interfaces/InferIntRangeInterface.cpp
@@ -126,3 +126,51 @@ raw_ostream &mlir::operator<<(raw_ostream &os, const ConstantIntRanges &range) {
   return os << "unsigned : [" << range.umin() << ", " << range.umax()
             << "] signed : [" << range.smin() << ", " << range.smax() << "]";
 }
+
+IntegerValueRange IntegerValueRange::getMaxRange(Value value) {
+  unsigned width = ConstantIntRanges::getStorageBitwidth(value.getType());
+  if (width == 0)
+    return {};
+  // `width` is non-zero here, so the signed bounds need no special case.
+  APInt umin = APInt::getMinValue(width);
+  APInt umax = APInt::getMaxValue(width);
+  APInt smin = APInt::getSignedMinValue(width);
+  APInt smax = APInt::getSignedMaxValue(width);
+  return IntegerValueRange{ConstantIntRanges{umin, umax, smin, smax}};
+}
+
+raw_ostream &mlir::operator<<(raw_ostream &os, const IntegerValueRange &range) {
+  range.print(os);
+  return os;
+}
+
+void mlir::intrange::detail::defaultInferResultRanges(
+    InferIntRangeInterface interface, ArrayRef<IntegerValueRange> argRanges,
+    SetIntLatticeFn setResultRanges) {
+  llvm::SmallVector<ConstantIntRanges> unpacked;
+  unpacked.reserve(argRanges.size());
+  // Bail out without setting any result if an argument range is uninitialized.
+  for (const IntegerValueRange &range : argRanges) {
+    if (range.isUninitialized())
+      return;
+    unpacked.push_back(range.getValue());
+  }
+  // Re-wrap each inferred constant range before handing it to the caller.
+  interface.inferResultRanges(
+      unpacked,
+      [&setResultRanges](Value value, const ConstantIntRanges &resultRange) {
+        setResultRanges(value, IntegerValueRange{resultRange});
+      });
+}
+
+void mlir::intrange::detail::defaultInferResultRangesFromOptional(
+    InferIntRangeInterface interface, ArrayRef<ConstantIntRanges> argRanges,
+    SetIntRangeFn setResultRanges) {
+  auto ranges = llvm::to_vector_of<IntegerValueRange>(argRanges);
+  // Only initialized result ranges are forwarded to the caller's callback.
+  interface.inferResultRangesFromOptional(
+      ranges, [&](Value value, const IntegerValueRange &resultRange) {
+        if (!resultRange.isUninitialized())
+          setResultRanges(value, resultRange.getValue());
+      });
+}
diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
index fe1a67d628738..5b8d35e7bd519 100644
--- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
+++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
@@ -76,7 +76,7 @@ static ConstantIntRanges minMaxBy(ConstArithFn op, ArrayRef<APInt> lhs,
 //===----------------------------------------------------------------------===//
 
 ConstantIntRanges
-mlir::intrange::inferIndexOp(InferRangeFn inferFn,
+mlir::intrange::inferIndexOp(const InferRangeFn &inferFn,
                              ArrayRef<ConstantIntRanges> argRanges,
                              intrange::CmpMode mode) {
   ConstantIntRanges sixtyFour = inferFn(argRanges);
diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
index 7db7163bac4ab..f19e0f8c4c2a4 100644
--- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp
+++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
@@ -1316,7 +1316,11 @@ LogicalResult CppEmitter::emitOperand(Value value) {
     FailureOr<int> precedence = getOperatorPrecedence(def);
     if (failed(precedence))
       return failure();
-    bool encloseInParenthesis = precedence.value() < getExpressionPrecedence();
+
+    // Sub-expressions with equal or lower precedence need to be parenthesized,
+    // as they might be evaluated in the wrong order depending on the shape of
+    // the expression tree.
+    bool encloseInParenthesis = precedence.value() <= getExpressionPrecedence();
     if (encloseInParenthesis) {
       os << "(";
       pushExpressionPrecedence(lowestPrecedence());
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index d73428a0f4df3..43410aaa6af1b 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -958,6 +958,41 @@ def conv_2d_ngchw_gfchw(
     ) * TypeFn.cast_signed(U, K[D.g, D.fg, D.c, D.kh, D.kw])
 
 
+@linalg_structured_op
+def conv_2d_ngchw_gfchw_q(
+    I=TensorDef(
+        T1, S.N, S.G, S.C, S.OH * S.SH + S.KH * S.DH, S.OW * S.SW + S.KW * S.DW
+    ),
+    K=TensorDef(T2, S.G, S.FG, S.C, S.KH, S.KW),
+    IZp=ScalarDef(I32),
+    KZp=ScalarDef(I32),
+    O=TensorDef(U, S.N, S.G, S.FG, S.OH, S.OW, output=True),
+    strides=IndexAttrDef(S.SH, S.SW, default=[1, 1]),
+    dilations=IndexAttrDef(S.DH, S.DW, default=[1, 1]),
+):
+    """Performs 2-D grouped convolution with zero-point offsets.
+
+    Layout:
+      * Input: NGCHW.
+      * Kernel: GFCHW.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output. This includes the zero
+    point offsets common to quantized operations.
+    """
+    implements(ConvolutionOpInterface)
+    domain(D.n, D.g, D.fg, D.oh, D.ow, D.c, D.kh, D.kw)
+    O[D.n, D.g, D.fg, D.oh, D.ow] += (
+        TypeFn.cast_signed(
+            U, I[D.n, D.g, D.c, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW]
+        )
+        - TypeFn.cast_signed(U, IZp)
+    ) * (
+        TypeFn.cast_signed(U, K[D.g, D.fg, D.c, D.kh, D.kw])
+        - TypeFn.cast_signed(U, KZp)
+    )
+
+
 @linalg_structured_op
 def conv_3d_ndhwc_dhwcf(
     I=TensorDef(
diff --git a/mlir/python/mlir/dialects/scf.py b/mlir/python/mlir/dialects/scf.py
index dad7377987e56..7025f6e0f1a16 100644
--- a/mlir/python/mlir/dialects/scf.py
+++ b/mlir/python/mlir/dialects/scf.py
@@ -132,8 +132,8 @@ def for_(
     iter_args = tuple(for_op.inner_iter_args)
     with InsertionPoint(for_op.body):
         if len(iter_args) > 1:
-            yield iv, iter_args
+            yield iv, iter_args, for_op.results
         elif len(iter_args) == 1:
-            yield iv, iter_args[0]
+            yield iv, iter_args[0], for_op.results[0]
         else:
             yield iv
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
index b453b69a214e8..dac3fd99b607c 100644
--- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
@@ -88,6 +88,17 @@ func.func @arith_index(%arg0: index, %arg1: index) {
 
 // -----
 
+// CHECK-LABEL: arith_signed_integer_div_rem
+func.func @arith_signed_integer_div_rem(%arg0: i32, %arg1: i32) {
+  // CHECK: emitc.div %arg0, %arg1 : (i32, i32) -> i32
+  %0 = arith.divsi %arg0, %arg1 : i32
+  // CHECK: emitc.rem %arg0, %arg1 : (i32, i32) -> i32
+  %1 = arith.remsi %arg0, %arg1 : i32
+  return
+}
+
+// -----
+
 func.func @arith_select(%arg0: i1, %arg1: tensor<8xi32>, %arg2: tensor<8xi32>) -> () {
   // CHECK: [[V0:[^ ]*]] = emitc.conditional %arg0, %arg1, %arg2 : tensor<8xi32>
   %0 = arith.select %arg0, %arg1, %arg2 : i1, tensor<8xi32>
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 8a2d8bd7967ca..a8d61a6a0f6fd 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -2,7 +2,8 @@
 // RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
 
 // CHECK-LABEL: @test_module
-// CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
+// CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_index_ops()
   // CHECK32-LABEL: func @gpu_index_ops()
diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
index 72e7e4cc84088..1e62e25176a00 100644
--- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
+++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir
@@ -420,6 +420,20 @@ func.func @test_reshape_6d_down_s2s_explicit(%arg0: tensor<1x2x3x5x7x11xf32>) ->
 
 // -----
 
+// CHECK-LABEL: @test_reshape_samerank_unsigned
+//  CHECK-SAME: (%[[ARG0:.*]]: tensor<3x2xui8>)
+func.func @test_reshape_samerank_unsigned(%arg0: tensor<3x2xui8>) -> tensor<2x3xui8> {
+  // CHECK-NEXT: %[[CAST1:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : tensor<3x2xui8> to tensor<3x2xi8>
+  // CHECK-NEXT: %[[RESHAPE1:.*]] = tensor.collapse_shape %[[CAST1]] {{\[}}[0, 1]] : tensor<3x2xi8> into tensor<6xi8>
+  // CHECK-NEXT: %[[RESHAPE2:.*]] = tensor.expand_shape %[[RESHAPE1]] {{\[}}[0, 1]] output_shape {{\[}}2, 3] : tensor<6xi8> into tensor<2x3xi8>
+  // CHECK-NEXT: %[[CAST2:.*]] = builtin.unrealized_conversion_cast %[[RESHAPE2]] : tensor<2x3xi8> to tensor<2x3xui8>
+  %0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 2, 3>} : (tensor<3x2xui8>) -> tensor<2x3xui8>
+  // CHECK-NEXT: return %[[CAST2]]
+  return %0 : tensor<2x3xui8>
+}
+
+// -----
+
 // CHECK-LABEL: func @slice
 func.func @slice(%arg0: tensor<6xf32>) ->() {
   // CHECK: [[SLICE:%.+]] = tensor.extract_slice %arg0[2] [1] [1]
diff --git a/mlir/test/Dialect/Arith/bufferize.mlir b/mlir/test/Dialect/Arith/bufferize.mlir
index 944954e9e4edd..a3b1454fb68f6 100644
--- a/mlir/test/Dialect/Arith/bufferize.mlir
+++ b/mlir/test/Dialect/Arith/bufferize.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s -arith-bufferize -split-input-file -verify-diagnostics | FileCheck %s
-// RUN: mlir-opt %s -arith-bufferize=alignment=64 -split-input-file -verify-diagnostics | FileCheck --check-prefix=ALIGNED %s
+// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=arith,bufferization copy-before-write unknown-type-conversion=identity-layout-map" -split-input-file -verify-diagnostics | FileCheck %s
 
 // CHECK-LABEL:   func @index_cast(
 // CHECK-SAME:  %[[TENSOR:.*]]: tensor<i32>, %[[SCALAR:.*]]: i32
@@ -22,10 +21,7 @@ func.func @index_cast(%tensor: tensor<i32>, %scalar: i32) -> (tensor<index>, ind
 // The name isn't load-bearing though.
 
 // CHECK: memref.global "private" constant @__constant_3x4xf32 : memref<3x4xf32> = dense<7.000000e+00>
-// CHECK-NOT: alignment
-
-// ALIGNED: memref.global "private" constant @__constant_3x4xf32 : memref<3x4xf32> = dense<7.000000e+00>
-// ALIGNED-SAME: {alignment = 64 : i64}
+// CHECK-SAME: {alignment = 64 : i64}
 
 // CHECK: @basic
 func.func @basic() -> tensor<3x4xf32> {
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index 1a387c20c4b29..e4f95bb0545a2 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -2950,14 +2950,6 @@ func.func @unsignedExtendConstantResource() -> tensor<i16> {
   return %ext : tensor<i16>
 }
 
-// Just checks that this doesn't crash.
-// CHECK-LABEL: @signedExtendSplatAsDynamicShape
-func.func @signedExtendSplatAsDynamicShape() -> tensor<?xi64> {
-  %splat = arith.constant dense<5> : tensor<2xi16>
-  %extsplat = arith.extsi %splat : tensor<2xi16> to tensor<?xi64>
-  return %extsplat : tensor<?xi64>
-}
-
 // CHECK-LABEL: @extsi_i0
 //       CHECK:   %[[ZERO:.*]] = arith.constant 0 : i16
 //       CHECK:   return %[[ZERO]] : i16
diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir
index 5b538197a0c11..60f0ab41afa48 100644
--- a/mlir/test/Dialect/Arith/int-range-interface.mlir
+++ b/mlir/test/Dialect/Arith/int-range-interface.mlir
@@ -899,3 +899,22 @@ func.func @test_shl_i8_nowrap() -> i8 {
   %2 = test.reflect_bounds %1 : i8
   return %2: i8
 }
+
+/// A test case to ensure that the ranges for unsupported ops are initialized
+/// properly to maxRange, rather than left uninitialized.
+/// In this test case, the previous behavior would leave the ranges for %a and
+/// %b uninitialized, resulting in arith.cmpf's range not being updated, even
+/// though it has an integer valued result.
+
+// CHECK-LABEL: func @test_cmpf_propagates
+// CHECK: test.reflect_bounds {smax = 2 : index, smin = 1 : index, umax = 2 : index, umin = 1 : index}
+func.func @test_cmpf_propagates(%a: f32, %b: f32) -> index {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+
+  %0 = arith.cmpf ueq, %a, %b : f32
+  %1 = arith.select %0, %c1, %c2 : index
+  %2 = test.reflect_bounds %1 : index
+  func.return %2 : index
+}
+
diff --git a/mlir/test/Dialect/Arith/invalid.mlir b/mlir/test/Dialect/Arith/invalid.mlir
index ada849220bb83..652aa738ad392 100644
--- a/mlir/test/Dialect/Arith/invalid.mlir
+++ b/mlir/test/Dialect/Arith/invalid.mlir
@@ -1,13 +1,21 @@
 // RUN: mlir-opt -split-input-file %s -verify-diagnostics
 
 func.func @test_index_cast_shape_error(%arg0 : tensor<index>) -> tensor<2xi64> {
-  // expected-error @+1 {{'arith.index_cast' op requires the same shape for all operands and results}}
+  // expected-error @+1 {{'arith.index_cast' op failed to verify that input and output have the same tensor dimensions}}
   %0 = arith.index_cast %arg0 : tensor<index> to tensor<2xi64>
   return %0 : tensor<2xi64>
 }
 
 // -----
 
+func.func @test_index_cast_shape_dim_error(%arg0 : tensor<2xindex>) -> tensor<?xi64> {
+  // expected-error @+1 {{'arith.index_cast' op failed to verify that input and output have the same tensor dimensions}}
+  %0 = arith.index_cast %arg0 : tensor<2xindex> to tensor<?xi64>
+  return %0 : tensor<?xi64>
+}
+
+// -----
+
 func.func @test_index_cast_tensor_error(%arg0 : tensor<index>) -> i64 {
   // expected-error @+1 {{'arith.index_cast' op requires the same shape for all operands and results}}
   %0 = arith.index_cast %arg0 : tensor<index> to i64
@@ -655,6 +663,14 @@ func.func @extsi_scalable_to_fl(%arg0 : vector<[4]xi32>) {
 
 // -----
 
+func.func @extsi_tensor_dim(%arg0 : tensor<4xi32>) {
+  // expected-error@+1 {{'arith.extsi' op failed to verify that input and output have the same tensor dimensions}}
+  %0 = arith.extsi %arg0 : tensor<4xi32> to tensor<?xi64>
+  return
+}
+
+// -----
+
 func.func @extf_scalable_to_fl(%arg0 : vector<[4]xf32>) {
   // expected-error@+1 {{'arith.extf' op requires the same shape for all operands and results}}
   %0 = arith.extf %arg0 : vector<[4]xf32> to vector<4xf64>
@@ -703,6 +719,22 @@ func.func @bitcast_scalable_to_fl(%arg0 : vector<[4]xf32>) {
 
 // -----
 
+func.func @bitcast_tensor_dim(%arg0 : tensor<4xf32>) {
+  // expected-error@+1 {{'arith.bitcast' op failed to verify that input and output have the same tensor dimensions}}
+  %0 = arith.bitcast %arg0 : tensor<4xf32> to tensor<?xi32>
+  return
+}
+
+// -----
+
+func.func @bitcast_tensor_dim(%arg0 : tensor<?xf32>) {
+  // expected-error@+1 {{'arith.bitcast' op failed to verify that input and output have the same tensor dimensions}}
+  %0 = arith.bitcast %arg0 : tensor<?xf32> to tensor<4xi32>
+  return
+}
+
+// -----
+
 func.func @trunci_fl_to_scalable(%arg0 : vector<4xi32>) {
   // expected-error@+1 {{'arith.trunci' op requires the same shape for all operands and results}}
   %0 = arith.trunci %arg0 : vector<4xi32> to vector<[4]xi8>
@@ -719,6 +751,14 @@ func.func @truncf_fl_to_scalable(%arg0 : vector<4xf64>) {
 
 // -----
 
+func.func @truncf_tensor_dim(%arg0 : tensor<4xf64>) {
+  // expected-error@+1 {{'arith.truncf' op failed to verify that input and output have the same tensor dimensions}}
+  %0 = arith.truncf %arg0 : tensor<4xf64> to tensor<?xf32>
+  return
+}
+
+// -----
+
 func.func @extui_fl_to_scalable(%arg0 : vector<4xi32>) {
   // expected-error@+1 {{'arith.extui' op requires the same shape for all operands and results}}
   %0 = arith.extui %arg0 : vector<4xi32> to vector<[4]xi64>
diff --git a/mlir/test/Dialect/GPU/dynamic-shared-memory.mlir b/mlir/test/Dialect/GPU/dynamic-shared-memory.mlir
index fb45faaa712f7..d73125fd763e6 100644
--- a/mlir/test/Dialect/GPU/dynamic-shared-memory.mlir
+++ b/mlir/test/Dialect/GPU/dynamic-shared-memory.mlir
@@ -3,11 +3,11 @@
 gpu.module @modules {
   // CHECK: llvm.mlir.global internal @__dynamic_shmem__3() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
   llvm.mlir.global internal @__dynamic_shmem__0() {addr_space = 3 : i32, alignment = 4 : i64} : !llvm.array<0 x i8>
-  llvm.mlir.global internal @__dynamic_shmem__1() {addr_space = 3 : i32, alignment = 4 : i64} : !llvm.array<0 x i8>  
-  llvm.mlir.global internal @__dynamic_shmem__2() {alignment = 16 : i64} : !llvm.array<0 x i8>  
+  llvm.mlir.global internal @__dynamic_shmem__1() {addr_space = 3 : i32, alignment = 4 : i64} : !llvm.array<0 x i8>
+  llvm.mlir.global internal @__dynamic_shmem__2() {alignment = 16 : i64} : !llvm.array<0 x i8>
   // CHECK-LABEL: llvm.func @dynamic_shared_memory_kernel(
   // CHECK-SAME: %[[arg0:.+]]: i64)
-  gpu.func @dynamic_shared_memory_kernel(%d : index) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 1, 1, 1>} {    
+  gpu.func @dynamic_shared_memory_kernel(%d : index) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 1, 1, 1>} {
     %c1 = arith.constant 1 : index
     %c8192 = arith.constant 8192 : index
     %c16384 = arith.constant 16384 : index
@@ -19,83 +19,83 @@ gpu.module @modules {
 
     %1 = memref.view %shmem[%c16384][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<32x64xf32, #gpu.address_space<workgroup>>
     "test.use.shared.memory"(%1) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
-    
-// CHECK: %[[S0:.+]] = llvm.mlir.constant(32 : index) : i64
-// CHECK: %[[S1:.+]] = llvm.mlir.constant(64 : index) : i64
-// CHECK: %[[S2:.+]] = llvm.mlir.constant(1 : index) : i64
-// CHECK: %[[S3:.+]] = llvm.mlir.constant(0 : index) : i64
-// CHECK: %[[S4:.+]] = llvm.mlir.addressof @__dynamic_shmem__3 : !llvm.ptr<3>
-// CHECK: %[[S5:.+]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
-// CHECK: %[[S6:.+]] = llvm.insertvalue %[[S4]], %[[S5]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S7:.+]] = llvm.getelementptr %[[S4]][8192] : (!llvm.ptr<3>) -> !llvm.ptr<3>, i8
-// CHECK: %[[S8:.+]] = llvm.insertvalue %[[S7]], %[[S6]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S9:.+]] = llvm.insertvalue %[[S3]], %[[S8]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S10:.+]] = llvm.insertvalue %[[S1]], %[[S9]][3, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S11:.+]] = llvm.insertvalue %[[S2]], %[[S10]][4, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S12:.+]] = llvm.insertvalue %[[S0]], %[[S11]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S13:.+]] = llvm.insertvalue %[[S1]], %[[S12]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S14:.+]] = builtin.unrealized_conversion_cast %[[S13]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> to memref<32x64xf32, #gpu.address_space<workgroup>>
-// CHECK: "test.use.shared.memory"(%[[S14]]) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
-// CHECK: %[[S15:.+]] = llvm.getelementptr %4[16384] : (!llvm.ptr<3>) -> !llvm.ptr<3>, i8
-// CHECK: %[[S16:.+]] = llvm.insertvalue %[[S15]], %[[S6]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S17:.+]] = llvm.insertvalue %[[S3]], %[[S16]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S18:.+]] = llvm.insertvalue %[[S1]], %[[S17]][3, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S19:.+]] = llvm.insertvalue %[[S2]], %[[S18]][4, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S20:.+]] = llvm.insertvalue %[[S0]], %[[S19]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S21:.+]] = llvm.insertvalue %[[S1]], %[[S20]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S22:.+]] = builtin.unrealized_conversion_cast %[[S21]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> to memref<32x64xf32, #gpu.address_space<workgroup>>
-// CHECK: "test.use.shared.memory"(%[[S22]]) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
+
+// CHECK-DAG: %[[S0:.+]] = llvm.mlir.constant(32 : index) : i64
+// CHECK-DAG: %[[S1:.+]] = llvm.mlir.constant(64 : index) : i64
+// CHECK-DAG: %[[S2:.+]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+// CHECK-DAG: %[[S3:.+]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-DAG: %[[S4:.+]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-DAG: %[[S5:.+]] = llvm.mlir.addressof @__dynamic_shmem__3 : !llvm.ptr<3>
+//     CHECK: %[[S6:.+]] = llvm.insertvalue %[[S5]], %[[S2]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S7:.+]] = llvm.getelementptr %[[S5]][8192] : (!llvm.ptr<3>) -> !llvm.ptr<3>, i8
+//     CHECK: %[[S8:.+]] = llvm.insertvalue %[[S7]], %[[S6]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S9:.+]] = llvm.insertvalue %[[S4]], %[[S8]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S10:.+]] = llvm.insertvalue %[[S1]], %[[S9]][3, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S11:.+]] = llvm.insertvalue %[[S3]], %[[S10]][4, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S12:.+]] = llvm.insertvalue %[[S0]], %[[S11]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S13:.+]] = llvm.insertvalue %[[S1]], %[[S12]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S14:.+]] = builtin.unrealized_conversion_cast %[[S13]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> to memref<32x64xf32, #gpu.address_space<workgroup>>
+//     CHECK: "test.use.shared.memory"(%[[S14]]) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
+//     CHECK: %[[S15:.+]] = llvm.getelementptr %[[S5]][16384] : (!llvm.ptr<3>) -> !llvm.ptr<3>, i8
+//     CHECK: %[[S16:.+]] = llvm.insertvalue %[[S15]], %[[S6]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S17:.+]] = llvm.insertvalue %[[S4]], %[[S16]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S18:.+]] = llvm.insertvalue %[[S1]], %[[S17]][3, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S19:.+]] = llvm.insertvalue %[[S3]], %[[S18]][4, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S20:.+]] = llvm.insertvalue %[[S0]], %[[S19]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S21:.+]] = llvm.insertvalue %[[S1]], %[[S20]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S22:.+]] = builtin.unrealized_conversion_cast %[[S21]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> to memref<32x64xf32, #gpu.address_space<workgroup>>
+//     CHECK: "test.use.shared.memory"(%[[S22]]) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
     gpu.return
   }
 
 // CHECK-LABEL: llvm.func @gpu_device_function
-  gpu.func @gpu_device_function()  {    
+  gpu.func @gpu_device_function()  {
     %c8192 = arith.constant 8192 : index
     %shmem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
     %0 = memref.view %shmem[%c8192][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<32x64xf32, #gpu.address_space<workgroup>>
     "test.use.shared.memory"(%0) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
-// CHECK: %[[S0:.+]] = llvm.mlir.constant(32 : index) : i64
-// CHECK: %[[S1:.+]] = llvm.mlir.constant(64 : index) : i64
-// CHECK: %[[S2:.+]] = llvm.mlir.constant(1 : index) : i64
-// CHECK: %[[S3:.+]] = llvm.mlir.constant(0 : index) : i64
-// CHECK: %[[S4:.+]] = llvm.mlir.addressof @__dynamic_shmem__3 : !llvm.ptr<3>
-// CHECK: %[[S5:.+]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
-// CHECK: %[[S6:.+]] = llvm.insertvalue %[[S4]], %[[S5]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S7:.+]] = llvm.getelementptr %[[S4]][8192] : (!llvm.ptr<3>) -> !llvm.ptr<3>, i8
-// CHECK: %[[S8:.+]] = llvm.insertvalue %[[S7]], %[[S6]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S9:.+]] = llvm.insertvalue %[[S3]], %[[S8]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S10:.+]] = llvm.insertvalue %[[S1]], %[[S9]][3, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S11:.+]] = llvm.insertvalue %[[S2]], %[[S10]][4, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S12:.+]] = llvm.insertvalue %[[S0]], %[[S11]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S13:.+]] = llvm.insertvalue %[[S1]], %[[S12]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S14:.+]] = builtin.unrealized_conversion_cast %13 : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> to memref<32x64xf32, #gpu.address_space<workgroup>>
-// CHECK: "test.use.shared.memory"(%[[S14]]) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
+// CHECK-DAG: %[[S0:.+]] = llvm.mlir.constant(32 : index) : i64
+// CHECK-DAG: %[[S1:.+]] = llvm.mlir.constant(64 : index) : i64
+// CHECK-DAG: %[[S2:.+]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+// CHECK-DAG: %[[S3:.+]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-DAG: %[[S4:.+]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-DAG: %[[S5:.+]] = llvm.mlir.addressof @__dynamic_shmem__3 : !llvm.ptr<3>
+//     CHECK: %[[S6:.+]] = llvm.insertvalue %[[S5]], %[[S2]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S7:.+]] = llvm.getelementptr %[[S5]][8192] : (!llvm.ptr<3>) -> !llvm.ptr<3>, i8
+//     CHECK: %[[S8:.+]] = llvm.insertvalue %[[S7]], %[[S6]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S9:.+]] = llvm.insertvalue %[[S4]], %[[S8]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S10:.+]] = llvm.insertvalue %[[S1]], %[[S9]][3, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S11:.+]] = llvm.insertvalue %[[S3]], %[[S10]][4, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S12:.+]] = llvm.insertvalue %[[S0]], %[[S11]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S13:.+]] = llvm.insertvalue %[[S1]], %[[S12]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S14:.+]] = builtin.unrealized_conversion_cast %[[S13]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> to memref<32x64xf32, #gpu.address_space<workgroup>>
+//     CHECK: "test.use.shared.memory"(%[[S14]]) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
 
     gpu.return
   }
 
 // CHECK-LABEL: llvm.func @func_device_function
-  func.func @func_device_function()  {    
+  func.func @func_device_function()  {
     %c8192 = arith.constant 8192 : index
     %shmem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
     %0 = memref.view %shmem[%c8192][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<32x64xf32, #gpu.address_space<workgroup>>
     "test.use.shared.memory"(%0) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
-// CHECK: %[[S0:.+]] = llvm.mlir.constant(32 : index) : i64
-// CHECK: %[[S1:.+]] = llvm.mlir.constant(64 : index) : i64
-// CHECK: %[[S2:.+]] = llvm.mlir.constant(1 : index) : i64
-// CHECK: %[[S3:.+]] = llvm.mlir.constant(0 : index) : i64
-// CHECK: %[[S4:.+]] = llvm.mlir.addressof @__dynamic_shmem__3 : !llvm.ptr<3>
-// CHECK: %[[S5:.+]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
-// CHECK: %[[S6:.+]] = llvm.insertvalue %[[S4]], %[[S5]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S7:.+]] = llvm.getelementptr %[[S4]][8192] : (!llvm.ptr<3>) -> !llvm.ptr<3>, i8
-// CHECK: %[[S8:.+]] = llvm.insertvalue %[[S7]], %[[S6]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S9:.+]] = llvm.insertvalue %[[S3]], %[[S8]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S10:.+]] = llvm.insertvalue %[[S1]], %[[S9]][3, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S11:.+]] = llvm.insertvalue %[[S2]], %[[S10]][4, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S12:.+]] = llvm.insertvalue %[[S0]], %[[S11]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S13:.+]] = llvm.insertvalue %[[S1]], %[[S12]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> 
-// CHECK: %[[S14:.+]] = builtin.unrealized_conversion_cast %13 : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> to memref<32x64xf32, #gpu.address_space<workgroup>>
-// CHECK: "test.use.shared.memory"(%[[S14]]) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
+// CHECK-DAG: %[[S0:.+]] = llvm.mlir.constant(32 : index) : i64
+// CHECK-DAG: %[[S1:.+]] = llvm.mlir.constant(64 : index) : i64
+// CHECK-DAG: %[[S2:.+]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+// CHECK-DAG: %[[S3:.+]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-DAG: %[[S4:.+]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-DAG: %[[S5:.+]] = llvm.mlir.addressof @__dynamic_shmem__3 : !llvm.ptr<3>
+//     CHECK: %[[S6:.+]] = llvm.insertvalue %[[S5]], %[[S2]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S7:.+]] = llvm.getelementptr %[[S5]][8192] : (!llvm.ptr<3>) -> !llvm.ptr<3>, i8
+//     CHECK: %[[S8:.+]] = llvm.insertvalue %[[S7]], %[[S6]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S9:.+]] = llvm.insertvalue %[[S4]], %[[S8]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S10:.+]] = llvm.insertvalue %[[S1]], %[[S9]][3, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S11:.+]] = llvm.insertvalue %[[S3]], %[[S10]][4, 1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S12:.+]] = llvm.insertvalue %[[S0]], %[[S11]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S13:.+]] = llvm.insertvalue %[[S1]], %[[S12]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+//     CHECK: %[[S14:.+]] = builtin.unrealized_conversion_cast %[[S13]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> to memref<32x64xf32, #gpu.address_space<workgroup>>
+//     CHECK: "test.use.shared.memory"(%[[S14]]) : (memref<32x64xf32, #gpu.address_space<workgroup>>) -> ()
 
     func.return
   }
diff --git a/mlir/test/Dialect/LLVMIR/constant-folding.mlir b/mlir/test/Dialect/LLVMIR/constant-folding.mlir
index 454126321eb97..497d679a12a09 100644
--- a/mlir/test/Dialect/LLVMIR/constant-folding.mlir
+++ b/mlir/test/Dialect/LLVMIR/constant-folding.mlir
@@ -101,3 +101,71 @@ llvm.func @addressof_blocks(%arg: i1) -> !llvm.ptr {
 }
 
 llvm.mlir.global constant @foo() : i32
+
+// -----
+
+// CHECK-LABEL: llvm.func @undef
+llvm.func @undef() {
+  // CHECK-NEXT: %[[UNDEF:.+]] = llvm.mlir.undef : i32
+  %undef1 = llvm.mlir.undef : i32
+  %undef2 = llvm.mlir.undef : i32
+  // CHECK-NEXT: llvm.call @foo(%[[UNDEF]], %[[UNDEF]])
+  llvm.call @foo(%undef1, %undef2) : (i32, i32) -> ()
+  // CHECK-NEXT: llvm.return
+  llvm.return
+}
+
+llvm.func @foo(i32, i32)
+
+// -----
+
+// CHECK-LABEL: llvm.func @poison
+llvm.func @poison() {
+  // CHECK-NEXT: %[[POISON:.+]] = llvm.mlir.poison : i32
+  %poison1 = llvm.mlir.poison : i32
+  %poison2 = llvm.mlir.poison : i32
+  // CHECK-NEXT: llvm.call @foo(%[[POISON]], %[[POISON]])
+  llvm.call @foo(%poison1, %poison2) : (i32, i32) -> ()
+  // CHECK-NEXT: llvm.return
+  llvm.return
+}
+
+llvm.func @foo(i32, i32)
+
+// -----
+
+llvm.func @foo(!llvm.ptr, !llvm.ptr)
+
+// CHECK-LABEL: llvm.func @null_pointer
+llvm.func @null_pointer() {
+  // CHECK-NEXT: %[[NULLPTR:.+]] = llvm.mlir.zero : !llvm.ptr
+  %nullptr1 = llvm.mlir.zero : !llvm.ptr
+  %nullptr2 = llvm.mlir.zero : !llvm.ptr
+  // CHECK-NEXT: llvm.call @foo(%[[NULLPTR]], %[[NULLPTR]])
+  llvm.call @foo(%nullptr1, %nullptr2) : (!llvm.ptr, !llvm.ptr) -> ()
+  // CHECK-NEXT: llvm.return
+  llvm.return
+}
+
+// -----
+
+// CHECK-LABEL: llvm.func @zero_integer
+llvm.func @zero_integer() -> i64 {
+  // CHECK-NEXT: %[[ZERO:.+]] = llvm.mlir.constant(0 : i64) : i64
+  %zero = llvm.mlir.zero : i32
+  %zero_extended = llvm.zext %zero : i32 to i64
+  // CHECK-NEXT: llvm.return %[[ZERO]]
+  llvm.return %zero_extended : i64
+}
+
+// -----
+
+// CHECK-LABEL: llvm.func @null_pointer_select
+llvm.func @null_pointer_select(%cond: i1) -> !llvm.ptr {
+  // CHECK-NEXT: %[[NULLPTR:.+]] = llvm.mlir.zero : !llvm.ptr
+  %nullptr1 = llvm.mlir.zero : !llvm.ptr
+  %nullptr2 = llvm.mlir.zero : !llvm.ptr
+  %result = arith.select %cond, %nullptr1, %nullptr2 : !llvm.ptr
+  // CHECK-NEXT: llvm.return %[[NULLPTR]]
+  llvm.return %result : !llvm.ptr
+}
diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir
index 29f27e6838e66..e8ab1184b1fd2 100644
--- a/mlir/test/Dialect/Linalg/bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/bufferize.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -linalg-bufferize -canonicalize -cse -split-input-file %s | FileCheck %s
+// RUN: mlir-opt --one-shot-bufferize="dialect-filter=linalg,bufferization copy-before-write unknown-type-conversion=identity-layout-map" -canonicalize -cse -split-input-file %s | FileCheck %s
 
 #map0 = affine_map<(d0) -> (d0)>
 
@@ -189,31 +189,3 @@ func.func @bufferize_dot(%in: tensor<4xf32>, %out: tensor<f32>) -> tensor<f32> {
   // CHECK: %[[OUT_TENSOR:.*]] = bufferization.to_tensor %[[ALLOC]] : memref<f32>
   // CHECK: return %[[OUT_TENSOR]]
 }
-
-// -----
-
-// This is a regression test. The linalg-bufferize pass should ignore all func
-// dialect ops.
-
-// CHECK-LABEL: func private @csum(tensor<6xi64>) -> tensor<6xi64>
-func.func private @csum(%arg0: tensor<6xi64>) -> tensor<6xi64>
-
-// CHECK: func public @main(%[[arg0:.*]]: tensor<2x3xi1>)
-// CHECK:   %[[collapse:.*]] = tensor.collapse_shape %[[arg0]]
-// CHECK:   %[[collapse_m:.*]] = bufferization.to_memref %[[collapse]]
-// CHECK:   %[[alloc:.*]] = memref.alloc()
-// CHECK:   linalg.generic {{.*}} ins(%[[collapse_m]] : memref<6xi1>) outs(%[[alloc]] : memref<6xi64>)
-// CHECK:   %[[generic_t:.*]] = bufferization.to_tensor %[[alloc]]
-// CHECK:   %[[call:.*]] = call @csum(%[[generic_t]])
-// CHECK:   return %[[call]]
-func.func public @main(%arg0: tensor<2x3xi1>) -> tensor<6xi64> {
-  %0 = tensor.collapse_shape %arg0 [[0, 1]] : tensor<2x3xi1> into tensor<6xi1>
-  %1 = tensor.empty() : tensor<6xi64>
-  %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<6xi1>) outs(%1 : tensor<6xi64>) {
-  ^bb0(%arg1: i1, %arg2: i64):
-    %4 = arith.extui %arg1 : i1 to i64
-    linalg.yield %4 : i64
-  } -> tensor<6xi64>
-  %3 = func.call @csum(%2) : (tensor<6xi64>) -> tensor<6xi64>
-  return %3 : tensor<6xi64>
-}
diff --git a/mlir/test/Dialect/Linalg/constant-fold.mlir b/mlir/test/Dialect/Linalg/constant-fold.mlir
new file mode 100644
index 0000000000000..3929c26a3382f
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/constant-fold.mlir
@@ -0,0 +1,148 @@
+// RUN: mlir-opt %s -linalg-fuse-elementwise-ops -split-input-file | FileCheck %s
+
+// CHECK-LABEL: @transpose_fold_2d_fp32
+func.func @transpose_fold_2d_fp32(%init: tensor<3x2xf32>) -> tensor<3x2xf32> {
+  %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>
+  //               CHECK: %[[CST:.+]] = arith.constant
+  // CHECK-SAME{LITERAL}:   dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32>
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
+  ^bb0(%arg1: f32, %arg2: f32):
+    linalg.yield %arg1 : f32
+  } -> tensor<3x2xf32>
+  // CHECK: return %[[CST]]
+  return %1 : tensor<3x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_fold_2d_fp64
+func.func @transpose_fold_2d_fp64(%init: tensor<3x2xf64>) -> tensor<3x2xf64> {
+  %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf64>
+  //               CHECK: %[[CST:.+]] = arith.constant
+  // CHECK-SAME{LITERAL}:   dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf64>
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%input : tensor<2x3xf64>) outs(%init : tensor<3x2xf64>) {
+  ^bb0(%arg1: f64, %arg2: f64):
+    linalg.yield %arg1 : f64
+  } -> tensor<3x2xf64>
+  // CHECK: return %[[CST]]
+  return %1 : tensor<3x2xf64>
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_fold_4d_i32
+func.func @transpose_fold_4d_i32(%init: tensor<3x1x4x2xi32>) -> tensor<3x1x4x2xi32> {
+  %input = arith.constant dense<[[
+    [[ 0,  1,  2,  3], [ 4,  5,  6,  7], [ 8,  9, 10, 11]],
+    [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
+  ]]> : tensor<1x2x3x4xi32>
+  //               CHECK: %[[CST:.+]] = arith.constant dense<[
+  // CHECK-SAME{LITERAL}:   [[[0, 12], [1, 13], [2, 14], [3, 15]]],
+  // CHECK-SAME{LITERAL}:   [[[4, 16], [5, 17], [6, 18], [7, 19]]],
+  // CHECK-SAME{LITERAL}:   [[[8, 20], [9, 21], [10, 22], [11, 23]]]
+  // CHECK-SAME{LITERAL}: ]>
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>],
+    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
+  } ins(%input : tensor<1x2x3x4xi32>) outs(%init : tensor<3x1x4x2xi32>) {
+  ^bb0(%arg1: i32, %arg2: i32):
+    linalg.yield %arg1 : i32
+  } -> tensor<3x1x4x2xi32>
+  // CHECK: return %[[CST]]
+  return %1 : tensor<3x1x4x2xi32>
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_fold_4d_i16
+func.func @transpose_fold_4d_i16(%init: tensor<3x1x4x2xi16>) -> tensor<3x1x4x2xi16> {
+  %input = arith.constant dense<[[
+    [[ 0,  1,  2,  3], [ 4,  5,  6,  7], [ 8,  9, 10, 11]],
+    [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
+  ]]> : tensor<1x2x3x4xi16>
+  //               CHECK: %[[CST:.+]] = arith.constant dense<[
+  // CHECK-SAME{LITERAL}:   [[[0, 12], [1, 13], [2, 14], [3, 15]]],
+  // CHECK-SAME{LITERAL}:   [[[4, 16], [5, 17], [6, 18], [7, 19]]],
+  // CHECK-SAME{LITERAL}:   [[[8, 20], [9, 21], [10, 22], [11, 23]]]
+  // CHECK-SAME{LITERAL}: ]>
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>],
+    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
+  } ins(%input : tensor<1x2x3x4xi16>) outs(%init : tensor<3x1x4x2xi16>) {
+  ^bb0(%arg1: i16, %arg2: i16):
+    linalg.yield %arg1 : i16
+  } -> tensor<3x1x4x2xi16>
+  // CHECK: return %[[CST]]
+  return %1 : tensor<3x1x4x2xi16>
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_nofold_non_cst_input
+func.func @transpose_nofold_non_cst_input(%input: tensor<2x3xf32>, %init: tensor<3x2xf32>) -> tensor<3x2xf32> {
+  // CHECK: linalg.generic
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
+  ^bb0(%arg1: f32, %arg2: f32):
+    linalg.yield %arg1 : f32
+  } -> tensor<3x2xf32>
+  return %1 : tensor<3x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_nofold_yield_const
+func.func @transpose_nofold_yield_const(%init: tensor<3x2xf32>) -> tensor<3x2xf32> {
+  %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>
+  %cst = arith.constant 8.0 : f32
+  // CHECK: linalg.generic
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
+  ^bb0(%arg1: f32, %arg2: f32):
+    linalg.yield %cst : f32
+  } -> tensor<3x2xf32>
+  return %1 : tensor<3x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @transpose_nofold_multi_ops_in_region
+func.func @transpose_nofold_multi_ops_in_region(%init: tensor<3x2xf32>) -> tensor<3x2xf32> {
+  %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>
+  // CHECK: linalg.generic
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
+  ^bb0(%arg1: f32, %arg2: f32):
+    %add = arith.addf %arg1, %arg1 : f32
+    linalg.yield %add : f32
+  } -> tensor<3x2xf32>
+  return %1 : tensor<3x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @named_transpose_fold_2d_fp32
+func.func @named_transpose_fold_2d_fp32(%init: tensor<3x2xf32>) -> tensor<3x2xf32> {
+  %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>
+  //               CHECK: %[[CST:.+]] = arith.constant
+  // CHECK-SAME{LITERAL}:   dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32>
+  %1 = linalg.transpose ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) permutation = [1, 0]
+  // CHECK: return %[[CST]]
+  return %1 : tensor<3x2xf32>
+}
+
+// -----
+
+
diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
index a9cbaaf7fdc48..8f9b12880adcf 100644
--- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
+++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
@@ -1087,3 +1087,46 @@ func.func @drop_known_unit_constant_low_high(%arg0: tensor<1x383x128xf32>) -> te
 //       CHECK:   } : tensor<383x128xf32> to tensor<384x128xf32>
 //       CHECK:   tensor.expand_shape %[[PADDED]]
 //  CHECK-SAME:     {{\[}}[0, 1], [2]] output_shape [1, 384, 128] : tensor<384x128xf32> into tensor<1x384x128xf32>
+
+// -----
+
+// CHECK: #[[$MAP0:.+]] = affine_map<()[s0, s1] -> (s0 * s1)>
+// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (0, d0)>
+// CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> ()>
+
+// CHECK-LABEL: func @drop_unit_dim_corresponding_to_dynamic_dim
+// CHECK-SAME:                    %[[ARG0:.*]]: tensor<1x?x?x1xf32>,
+// CHECK-SAME:                    %[[ARG1:.*]]: index) -> tensor<?x1x61x1xf32> {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant dense<1.000000e+00> : tensor<f32>
+// CHECK:           %[[VAL_3:.*]] = tensor.collapse_shape %[[ARG0]] {{\[\[}}0, 1], [2, 3]] : tensor<1x?x?x1xf32> into tensor<?x?xf32>
+// CHECK:           %[[VAL_4:.*]] = tensor.empty(%[[ARG1]]) : tensor<?x61xf32>
+// CHECK:           %[[VAL_5:.*]] = affine.apply #[[$MAP0]](){{\[}}%[[ARG1]], %[[VAL_1]]]
+// CHECK:           %[[VAL_6:.*]] = tensor.empty(%[[VAL_5]]) : tensor<?x61xf32>
+// CHECK:           %[[VAL_7:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel"]} ins(%[[VAL_3]], %[[VAL_2]], %[[VAL_4]] : tensor<?x?xf32>, tensor<f32>, tensor<?x61xf32>) outs(%[[VAL_6]] : tensor<?x61xf32>) {
+// CHECK:           ^bb0(%[[VAL_8:.*]]: f32, %[[VAL_9:.*]]: f32, %[[VAL_10:.*]]: f32, %[[VAL_11:.*]]: f32):
+// CHECK:             %[[VAL_12:.*]] = arith.mulf %[[VAL_8]], %[[VAL_9]] : f32
+// CHECK:             %[[VAL_13:.*]] = arith.addf %[[VAL_10]], %[[VAL_12]] : f32
+// CHECK:             linalg.yield %[[VAL_13]] : f32
+// CHECK:           } -> tensor<?x61xf32>
+// CHECK:           %[[VAL_14:.*]] = tensor.expand_shape %[[VAL_7]] {{\[\[}}0, 1], [2, 3]] output_shape {{\[}}%[[VAL_0]], 1, 61, 1] : tensor<?x61xf32> into tensor<?x1x61x1xf32>
+// CHECK:           return %[[VAL_14]] : tensor<?x1x61x1xf32>
+// CHECK:         }
+
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 + d4, d2 + d5, d6)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>
+module {
+  func.func @drop_unit_dim_corresponding_to_dynamic_dim(%arg0: tensor<1x?x?x1xf32>, %arg1: index) -> tensor<?x1x61x1xf32> {
+    %cst = arith.constant dense<1.000000e+00> : tensor<1x1x1x1xf32>
+    %0 = tensor.empty(%arg1) : tensor<?x1x61x1xf32>
+    %1 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%arg0, %cst : tensor<1x?x?x1xf32>, tensor<1x1x1x1xf32>) outs(%0 : tensor<?x1x61x1xf32>) {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %2 = arith.mulf %in, %in_0 : f32
+      %3 = arith.addf %out, %2 : f32
+      linalg.yield %3 : f32
+    } -> tensor<?x1x61x1xf32>
+    return %1 : tensor<?x1x61x1xf32>
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir
index 15a4f6cdd3bbe..e45a9fbb1052c 100644
--- a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir
@@ -777,139 +777,6 @@ func.func @fuse_scalar_constant(%arg0 : tensor<?x?xf32>) -> (tensor<?x?xf32>, te
 
 // -----
 
-// CHECK-LABEL: @transpose_fold_2d_fp32
-func.func @transpose_fold_2d_fp32(%init: tensor<3x2xf32>) -> tensor<3x2xf32> {
-  %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>
-  //               CHECK: %[[CST:.+]] = arith.constant
-  // CHECK-SAME{LITERAL}:   dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32>
-  %1 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel"]
-  } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
-  ^bb0(%arg1: f32, %arg2: f32):
-    linalg.yield %arg1 : f32
-  } -> tensor<3x2xf32>
-  // CHECK: return %[[CST]]
-  return %1 : tensor<3x2xf32>
-}
-
-// -----
-
-// CHECK-LABEL: @transpose_fold_2d_fp64
-func.func @transpose_fold_2d_fp64(%init: tensor<3x2xf64>) -> tensor<3x2xf64> {
-  %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf64>
-  //               CHECK: %[[CST:.+]] = arith.constant
-  // CHECK-SAME{LITERAL}:   dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf64>
-  %1 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel"]
-  } ins(%input : tensor<2x3xf64>) outs(%init : tensor<3x2xf64>) {
-  ^bb0(%arg1: f64, %arg2: f64):
-    linalg.yield %arg1 : f64
-  } -> tensor<3x2xf64>
-  // CHECK: return %[[CST]]
-  return %1 : tensor<3x2xf64>
-}
-
-// -----
-
-// CHECK-LABEL: @transpose_fold_4d_i32
-func.func @transpose_fold_4d_i32(%init: tensor<3x1x4x2xi32>) -> tensor<3x1x4x2xi32> {
-  %input = arith.constant dense<[[
-    [[ 0,  1,  2,  3], [ 4,  5,  6,  7], [ 8,  9, 10, 11]],
-    [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
-  ]]> : tensor<1x2x3x4xi32>
-  //               CHECK: %[[CST:.+]] = arith.constant dense<[
-  // CHECK-SAME{LITERAL}:   [[[0, 12], [1, 13], [2, 14], [3, 15]]],
-  // CHECK-SAME{LITERAL}:   [[[4, 16], [5, 17], [6, 18], [7, 19]]],
-  // CHECK-SAME{LITERAL}:   [[[8, 20], [9, 21], [10, 22], [11, 23]]]
-  // CHECK-SAME{LITERAL}: ]>
-  %1 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>],
-    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
-  } ins(%input : tensor<1x2x3x4xi32>) outs(%init : tensor<3x1x4x2xi32>) {
-  ^bb0(%arg1: i32, %arg2: i32):
-    linalg.yield %arg1 : i32
-  } -> tensor<3x1x4x2xi32>
-  // CHECK: return %[[CST]]
-  return %1 : tensor<3x1x4x2xi32>
-}
-
-// -----
-
-// CHECK-LABEL: @transpose_fold_4d_i16
-func.func @transpose_fold_4d_i16(%init: tensor<3x1x4x2xi16>) -> tensor<3x1x4x2xi16> {
-  %input = arith.constant dense<[[
-    [[ 0,  1,  2,  3], [ 4,  5,  6,  7], [ 8,  9, 10, 11]],
-    [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
-  ]]> : tensor<1x2x3x4xi16>
-  //               CHECK: %[[CST:.+]] = arith.constant dense<[
-  // CHECK-SAME{LITERAL}:   [[[0, 12], [1, 13], [2, 14], [3, 15]]],
-  // CHECK-SAME{LITERAL}:   [[[4, 16], [5, 17], [6, 18], [7, 19]]],
-  // CHECK-SAME{LITERAL}:   [[[8, 20], [9, 21], [10, 22], [11, 23]]]
-  // CHECK-SAME{LITERAL}: ]>
-  %1 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>],
-    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
-  } ins(%input : tensor<1x2x3x4xi16>) outs(%init : tensor<3x1x4x2xi16>) {
-  ^bb0(%arg1: i16, %arg2: i16):
-    linalg.yield %arg1 : i16
-  } -> tensor<3x1x4x2xi16>
-  // CHECK: return %[[CST]]
-  return %1 : tensor<3x1x4x2xi16>
-}
-
-// -----
-
-// CHECK-LABEL: @transpose_nofold_non_cst_input
-func.func @transpose_nofold_non_cst_input(%input: tensor<2x3xf32>, %init: tensor<3x2xf32>) -> tensor<3x2xf32> {
-  // CHECK: linalg.generic
-  %1 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel"]
-  } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
-  ^bb0(%arg1: f32, %arg2: f32):
-    linalg.yield %arg1 : f32
-  } -> tensor<3x2xf32>
-  return %1 : tensor<3x2xf32>
-}
-
-// -----
-
-// CHECK-LABEL: @transpose_nofold_yield_const
-func.func @transpose_nofold_yield_const(%init: tensor<3x2xf32>) -> tensor<3x2xf32> {
-  %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>
-  %cst = arith.constant 8.0 : f32
-  // CHECK: linalg.generic
-  %1 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel"]
-  } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
-  ^bb0(%arg1: f32, %arg2: f32):
-    linalg.yield %cst : f32
-  } -> tensor<3x2xf32>
-  return %1 : tensor<3x2xf32>
-}
-
-// -----
-
-// CHECK-LABEL: @transpose_nofold_multi_ops_in_region
-func.func @transpose_nofold_multi_ops_in_region(%init: tensor<3x2xf32>) -> tensor<3x2xf32> {
-  %input = arith.constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>
-  // CHECK: linalg.generic
-  %1 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel"]
-  } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
-  ^bb0(%arg1: f32, %arg2: f32):
-    %add = arith.addf %arg1, %arg1 : f32
-    linalg.yield %add : f32
-  } -> tensor<3x2xf32>
-  return %1 : tensor<3x2xf32>
-}
-
-// -----
-
 // Fusing the broadcast into a reduction would require to insert extra knowledge
 // about the size of the reduction dimension. As long, as this is not
 // implemented, we check that two linalg operations remain.
diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
index 4f43ec2c9e1ce..31fac9b4b4165 100644
--- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
@@ -204,6 +204,37 @@ func.func @conv_1d_ncw_fcw(%input: memref<?x?x?xf32>, %filter: memref<?x?x?xf32>
 
 // -----
 
+func.func @conv_2d_ngchw_gfchw_q(%input: memref<?x?x?x?x?xi8>, %filter: memref<?x?x?x?x?xi8>, %inputzp: i32, %filterzp: i32, %output: memref<?x?x?x?x?xi32>) {
+  linalg.conv_2d_ngchw_gfchw_q {dilations = dense<1> : tensor<2xi64>,
+                                       strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter, %inputzp, %filterzp: memref<?x?x?x?x?xi8>, memref<?x?x?x?x?xi8>, i32, i32)
+    outs (%output: memref<?x?x?x?x?xi32>)
+  return
+}
+// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d5, d3 + d6, d4 + d7)>
+// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d2, d5, d6, d7)>
+// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> ()>
+// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4)>
+
+// CHECK: func @conv_2d_ngchw_gfchw_q
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]], #[[MAP2]], #[[MAP3]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]}
+// CHECK-SAME: ins(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : memref<?x?x?x?x?xi8>, memref<?x?x?x?x?xi8>, i32, i32)
+// CHECK-SAME: outs(%{{.+}} : memref<?x?x?x?x?xi32>)
+
+// CHECK:         ^{{.+}}(%[[BBARG0:.+]]: i8, %[[BBARG1:.+]]: i8, %[[BBARG2:.+]]: i32, %[[BBARG3:.+]]: i32, %[[BBARG4:.+]]: i32)
+// CHECK-NEXT:      %[[EXTSI0:.+]] = arith.extsi %[[BBARG0]] : i8 to i32
+// CHECK-NEXT:      %[[SUB0:.+]] = arith.subi %[[EXTSI0]], %[[BBARG2]] : i32
+// CHECK-NEXT:      %[[EXTSI1:.+]] = arith.extsi %[[BBARG1]] : i8 to i32
+// CHECK-NEXT:      %[[SUB1:.+]] = arith.subi %[[EXTSI1]], %[[BBARG3]] : i32
+// CHECK-NEXT:      %[[MUL:.+]] = arith.muli %[[SUB0]], %[[SUB1]] : i32
+// CHECK-NEXT:      %[[ADD:.+]] = arith.addi %[[BBARG4]], %[[MUL]] : i32
+// CHECK-NEXT:      linalg.yield %[[ADD]] : i32
+
+// -----
+
 func.func @generalize_fill(%output: memref<?x?xf32>, %value : f32) {
   linalg.fill ins(%value : f32) outs(%output : memref<?x?xf32>)
   return
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 051054e67edf0..02ecbed232c8b 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -441,6 +441,21 @@ func.func @conv_2d_ngchw_gfchw(%input: tensor<1x5x3x32x32xf32>, %filter: tensor<
 
 // -----
 
+// CHECK-LABEL: func @conv_2d_ngchw_gfchw_q
+func.func @conv_2d_ngchw_gfchw_q(%input: tensor<1x5x3x32x32xi8>, %filter: tensor<5x2x3x3x3xi8>, %inputzp: i32, %filterzp: i32, %init: tensor<1x5x2x30x30xi32>) -> tensor<1x5x2x30x30xi32> {
+  // CHECK:      linalg.conv_2d_ngchw_gfchw_q
+  // CHECK-SAME:   dilations = dense<1> : tensor<2xi64>
+  // CHECK-SAME:   strides = dense<1> : tensor<2xi64>
+  // CHECK-SAME:   ins(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}} : tensor<1x5x3x32x32xi8>, tensor<5x2x3x3x3xi8>, i32, i32)
+  // CHECK-SAME:   outs(%{{.+}} : tensor<1x5x2x30x30xi32>) -> tensor<1x5x2x30x30xi32>
+  %0 = linalg.conv_2d_ngchw_gfchw_q {dilations = dense<1> : tensor<2xi64>,
+                                         strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter, %inputzp, %filterzp: tensor<1x5x3x32x32xi8>, tensor<5x2x3x3x3xi8>, i32, i32)
+    outs (%init: tensor<1x5x2x30x30xi32>) -> tensor<1x5x2x30x30xi32>
+  return %0 : tensor<1x5x2x30x30xi32>
+}
+// -----
+
 // CHECK-LABEL: func @conv_3d_ndhwc_dhwcf
 func.func @conv_3d_ndhwc_dhwcf(%input: tensor<?x?x?x?x?xf32>, %filter: tensor<?x?x?x?x?xf32>, %init: tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32> {
   // CHECK:      %{{.+}} = linalg.conv_3d_ndhwc_dhwcf
diff --git a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
index 31e9fd00cffa0..9849f36285b16 100644
--- a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir
@@ -407,3 +407,95 @@ module attributes {transform.with_named_sequence} {
       transform.yield
   }
 }
+
+// -----
+// Checks that NaN is used as the neutral element when splitting a maxnumf reduction.
+func.func @generic_split_maxnumf(%in: tensor<32xf32>, %out: tensor<f32>) -> tensor<f32> {
+  %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
+                                        affine_map<(d0) -> ()>],
+        iterator_types = ["reduction"]}
+  ins(%in : tensor<32xf32>)
+  outs(%out : tensor<f32>) {
+  ^bb0(%arg1: f32, %arg2: f32):
+    %y = arith.maxnumf %arg1, %arg2 : f32
+    linalg.yield %y : f32
+  } -> tensor<f32>
+  return %r : tensor<f32>
+}
+
+//  CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+//  CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)>
+//  CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0) -> (d0)>
+//  CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0) -> ()>
+// CHECK-LABEL:  func @generic_split_maxnumf
+//  The float value 0xFFC00000 that is filled into the init tensor represents negative NaN.
+//  CHECK-DAG: %[[ID:.*]] = arith.constant 0xFFC00000 : f32
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] output_shape [8, 4] : tensor<32xf32> into tensor<8x4xf32>
+//  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<4xf32>
+//      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32>
+//      CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["reduction", "parallel"]}
+// CHECK-SAME:   ins(%[[I1]] : tensor<8x4xf32>) outs(%[[F]] : tensor<4xf32>) {
+//      CHECK:   arith.maxnumf
+//      CHECK:   linalg.yield
+//      CHECK: } -> tensor<4xf32>
+//      CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP3]]], iterator_types = ["reduction"]}
+// CHECK-SAME:   ins(%[[G]] : tensor<4xf32>) outs(%{{.*}} : tensor<f32>) {
+//      CHECK:   arith.maxnumf {{.*}}
+//      CHECK:   linalg.yield
+//      CHECK:  } -> tensor<f32>
+//      CHECK: return %[[R]] : tensor<f32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1:4 = transform.structured.split_reduction %0 { split_factor = 4, insert_split_dimension = 0, inner_parallel}
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      transform.yield
+  }
+}
+
+// -----
+// Checks that NaN is used as the neutral element when splitting a minnumf reduction.
+func.func @generic_split_minnumf(%in: tensor<32xf32>, %out: tensor<f32>) -> tensor<f32> {
+  %r = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
+                                        affine_map<(d0) -> ()>],
+        iterator_types = ["reduction"]}
+  ins(%in : tensor<32xf32>)
+  outs(%out : tensor<f32>) {
+  ^bb0(%arg1: f32, %arg2: f32):
+    %y = arith.minnumf %arg1, %arg2 : f32
+    linalg.yield %y : f32
+  } -> tensor<f32>
+  return %r : tensor<f32>
+}
+
+//  CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+//  CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)>
+//  CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0) -> (d0)>
+//  CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0) -> ()>
+// CHECK-LABEL:  func @generic_split_minnumf
+//  The float value 0x7FC00000 that is filled into the init tensor represents positive NaN.
+//  CHECK-DAG: %[[ID:.*]] = arith.constant 0x7FC00000 : f32
+//  CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1]] output_shape [8, 4] : tensor<32xf32> into tensor<8x4xf32>
+//  CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<4xf32>
+//      CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32>
+//      CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["reduction", "parallel"]}
+// CHECK-SAME:   ins(%[[I1]] : tensor<8x4xf32>) outs(%[[F]] : tensor<4xf32>) {
+//      CHECK:   arith.minnumf
+//      CHECK:   linalg.yield
+//      CHECK: } -> tensor<4xf32>
+//      CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP3]]], iterator_types = ["reduction"]}
+// CHECK-SAME:   ins(%[[G]] : tensor<4xf32>) outs(%{{.*}} : tensor<f32>) {
+//      CHECK:   arith.minnumf {{.*}}
+//      CHECK:   linalg.yield
+//      CHECK:  } -> tensor<f32>
+//      CHECK: return %[[R]] : tensor<f32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1:4 = transform.structured.split_reduction %0 { split_factor = 4, insert_split_dimension = 0, inner_parallel}
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+      transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Shape/bufferize.mlir b/mlir/test/Dialect/Shape/bufferize.mlir
index 963a5e8bcf578..9f30a052208f0 100644
--- a/mlir/test/Dialect/Shape/bufferize.mlir
+++ b/mlir/test/Dialect/Shape/bufferize.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -split-input-file -shape-bufferize <%s | FileCheck %s
+// RUN: mlir-opt -split-input-file --one-shot-bufferize="dialect-filter=shape,bufferization copy-before-write unknown-type-conversion=identity-layout-map allow-unknown-ops" <%s | FileCheck %s
 
 // -----
 
diff --git a/mlir/test/Dialect/SparseTensor/conversion.mlir b/mlir/test/Dialect/SparseTensor/conversion.mlir
index f23f6ac4f181e..ff0fb22431d69 100644
--- a/mlir/test/Dialect/SparseTensor/conversion.mlir
+++ b/mlir/test/Dialect/SparseTensor/conversion.mlir
@@ -144,7 +144,7 @@ func.func @sparse_new3d(%arg0: !llvm.ptr) -> tensor<?x?x?xf32, #SparseTensor> {
 //   CHECK-DAG: %[[Iota:.*]] = memref.cast %[[Iota0]] : memref<2xindex> to memref<?xindex>
 //   CHECK-DAG: memref.store %[[I]], %[[Sizes0]][%[[C0]]] : memref<2xindex>
 //   CHECK-DAG: memref.store %[[J]], %[[Sizes0]][%[[C1]]] : memref<2xindex>
-//       CHECK: %[[NP:.*]] = llvm.mlir.zero : !llvm.ptr
+//   CHECK-DAG: %[[NP:.*]] = llvm.mlir.zero : !llvm.ptr
 //       CHECK: %[[T:.*]] = call @newSparseTensor(%[[Sizes]], %[[Sizes]], %[[LvlTypes]], %[[Iota]], %[[Iota]], %{{.*}}, %{{.*}}, %{{.*}}, %[[Empty]], %[[NP]])
 //       CHECK: return %[[T]] : !llvm.ptr
 func.func @sparse_init(%arg0: index, %arg1: index) -> tensor<?x?xf64, #CSR> {
diff --git a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
index 6e8a26762d90f..df3e4b0ed60c7 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir
@@ -6,6 +6,7 @@
 // CHECK-SAME:      %[[VAL_0:.*]]: !llvm.ptr,
 // CHECK-SAME:      %[[VAL_1:.*]]: !llvm.ptr) -> !llvm.ptr {
 // CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG:       %[[ZERO:.*]] = llvm.mlir.zero : !llvm.ptr
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 1 : i32
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : i32
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
@@ -27,8 +28,7 @@
 // CHECK:           %[[VAL_17:.*]] = memref.cast %[[VAL_16]] : memref<2xindex> to memref<?xindex>
 // CHECK:           memref.store %[[VAL_5]], %[[VAL_16]]{{\[}}%[[VAL_5]]] : memref<2xindex>
 // CHECK:           memref.store %[[VAL_6]], %[[VAL_16]]{{\[}}%[[VAL_6]]] : memref<2xindex>
-// CHECK:           %[[VAL_18:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:           %[[VAL_19:.*]] = call @newSparseTensor(%[[VAL_15]], %[[VAL_15]], %[[VAL_13]], %[[VAL_17]], %[[VAL_17]], %[[VAL_4]], %[[VAL_4]], %[[VAL_3]], %[[VAL_4]], %[[VAL_18]]) : (memref<?xindex>, memref<?xindex>, memref<?xi64>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr) -> !llvm.ptr
+// CHECK:           %[[VAL_19:.*]] = call @newSparseTensor(%[[VAL_15]], %[[VAL_15]], %[[VAL_13]], %[[VAL_17]], %[[VAL_17]], %[[VAL_4]], %[[VAL_4]], %[[VAL_3]], %[[VAL_4]], %[[ZERO]]) : (memref<?xindex>, memref<?xindex>, memref<?xi64>, memref<?xindex>, memref<?xindex>, i32, i32, i32, i32, !llvm.ptr) -> !llvm.ptr
 // CHECK:           %[[VAL_20:.*]] = memref.alloc() : memref<300xf64>
 // CHECK:           %[[VAL_21:.*]] = memref.cast %[[VAL_20]] : memref<300xf64> to memref<?xf64>
 // CHECK:           %[[VAL_22:.*]] = memref.alloc() : memref<300xi1>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir
index 6112856fbf293..c27df00785522 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir
@@ -4,8 +4,7 @@
 // RUN: FileCheck %s --check-prefix=CHECK-MIR
 //
 // RUN: mlir-opt %s --sparse-reinterpret-map -sparsification --sparse-tensor-conversion --cse \
-// RUN: --func-bufferize --arith-bufferize           \
-// RUN: --tensor-bufferize --finalizing-bufferize |  \
+// RUN: --one-shot-bufferize="copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" | \
 // RUN: FileCheck %s --check-prefix=CHECK-LIR
 
 #CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir
index 401da152a8bdb..9fbb9dd0a26d1 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir
@@ -4,8 +4,7 @@
 // RUN: FileCheck %s --check-prefix=CHECK-MIR
 //
 // RUN: mlir-opt %s --sparse-reinterpret-map -sparsification --sparse-tensor-conversion --cse \
-// RUN: --func-bufferize --arith-bufferize           \
-// RUN: --tensor-bufferize --finalizing-bufferize |  \
+// RUN: --one-shot-bufferize="copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" | \
 // RUN: FileCheck %s --check-prefix=CHECK-LIR
 
 #CSC = #sparse_tensor.encoding<{
diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir
index d769876d8ee8e..a827360abb426 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir
@@ -4,8 +4,7 @@
 // RUN: FileCheck %s --check-prefix=CHECK-MIR
 //
 // RUN: mlir-opt %s --sparse-reinterpret-map -sparsification --sparse-tensor-conversion --cse \
-// RUN: --func-bufferize --arith-bufferize           \
-// RUN: --tensor-bufferize --finalizing-bufferize |  \
+// RUN: --one-shot-bufferize="copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" | \
 // RUN: FileCheck %s --check-prefix=CHECK-LIR
 
 #CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}>
diff --git a/mlir/test/Dialect/SparseTensor/specifier_to_llvm.mlir b/mlir/test/Dialect/SparseTensor/specifier_to_llvm.mlir
index b647fe0cdeed0..00ff29125fb5f 100644
--- a/mlir/test/Dialect/SparseTensor/specifier_to_llvm.mlir
+++ b/mlir/test/Dialect/SparseTensor/specifier_to_llvm.mlir
@@ -3,12 +3,12 @@
 #CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}>
 
 // CHECK-LABEL:   func.func @sparse_metadata_init() -> !llvm.struct<(array<2 x i64>, array<3 x i64>)> {
-// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_1:.*]] = llvm.mlir.undef : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
-// CHECK:           %[[VAL_2:.*]] = llvm.insertvalue %[[VAL_0]], %[[VAL_1]][1, 0] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
-// CHECK:           %[[VAL_3:.*]] = llvm.insertvalue %[[VAL_0]], %[[VAL_2]][1, 1] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
-// CHECK:           %[[VAL_4:.*]] = llvm.insertvalue %[[VAL_0]], %[[VAL_3]][1, 2] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
-// CHECK:           return %[[VAL_4]] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
+// CHECK-DAG:       %[[STRUCT:.*]] = llvm.mlir.undef : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
+// CHECK-DAG:       %[[CST0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = llvm.insertvalue %[[CST0]], %[[STRUCT]][1, 0] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
+// CHECK:           %[[VAL_2:.*]] = llvm.insertvalue %[[CST0]], %[[VAL_1]][1, 1] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
+// CHECK:           %[[VAL_3:.*]] = llvm.insertvalue %[[CST0]], %[[VAL_2]][1, 2] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
+// CHECK:           return %[[VAL_3]] : !llvm.struct<(array<2 x i64>, array<3 x i64>)>
 // CHECK:         }
 func.func @sparse_metadata_init() -> !sparse_tensor.storage_specifier<#CSR> {
   %0 = sparse_tensor.storage_specifier.init : !sparse_tensor.storage_specifier<#CSR>
diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir
index 4f553adcc500f..e85d9e740adf4 100644
--- a/mlir/test/Dialect/Tensor/bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/bufferize.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -tensor-bufferize -cse -split-input-file | FileCheck %s
+// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=tensor,bufferization copy-before-write unknown-type-conversion=identity-layout-map" -cse -split-input-file | FileCheck %s
 
 // CHECK-LABEL:   func @dim(
 // CHECK-SAME:              %[[TENSOR:.*]]: tensor<*xf32>,
diff --git a/mlir/test/Dialect/Vector/bufferize-invalid.mlir b/mlir/test/Dialect/Vector/bufferize-invalid.mlir
index 1ae3e312c868f..bcca50a0fe79a 100644
--- a/mlir/test/Dialect/Vector/bufferize-invalid.mlir
+++ b/mlir/test/Dialect/Vector/bufferize-invalid.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s -vector-bufferize -split-input-file -verify-diagnostics
-// | FileCheck %s
+// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=vector,bufferization copy-before-write unknown-type-conversion=identity-layout-map allow-unknown-ops" -split-input-file -verify-diagnostics
 
 // CHECK-LABEL: func @mask(
 func.func @mask(%t0: tensor<?xf32>, %val: vector<16xf32>, %idx: index, %m0: vector<16xi1>) -> tensor<?xf32> {
diff --git a/mlir/test/Dialect/Vector/bufferize.mlir b/mlir/test/Dialect/Vector/bufferize.mlir
index 6a6a8fa8938bc..3399f60a2c3bf 100644
--- a/mlir/test/Dialect/Vector/bufferize.mlir
+++ b/mlir/test/Dialect/Vector/bufferize.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -vector-bufferize -split-input-file | FileCheck %s
+// RUN: mlir-opt %s --one-shot-bufferize="dialect-filter=vector,bufferization copy-before-write unknown-type-conversion=identity-layout-map" -split-input-file | FileCheck %s
 
 // CHECK-LABEL: func @transfer_read(
 //  CHECK-SAME:     %[[t:.*]]: tensor<?x?xf32>, %[[o1:.*]]: index, %[[o2:.*]]: index, %[[pad:.*]]: f32)
diff --git a/mlir/test/Dialect/Vector/linearize.mlir b/mlir/test/Dialect/Vector/linearize.mlir
index b29ceab5783d7..31a59b809a74b 100644
--- a/mlir/test/Dialect/Vector/linearize.mlir
+++ b/mlir/test/Dialect/Vector/linearize.mlir
@@ -245,3 +245,32 @@ func.func @test_vector_extract(%arg0: vector<2x8x2xf32>) -> vector<8x2xf32> {
   %0 = vector.extract %arg0[1]: vector<8x2xf32> from vector<2x8x2xf32>
   return %0 : vector<8x2xf32>
 }
+
+// -----
+// ALL-LABEL: test_vector_insert
+// ALL-SAME: (%[[DEST:.*]]: vector<2x8x4xf32>, %[[SRC:.*]]: vector<8x4xf32>) -> vector<2x8x4xf32> {
+func.func @test_vector_insert(%arg0: vector<2x8x4xf32>, %arg1: vector<8x4xf32>) -> vector<2x8x4xf32> {
+  // DEFAULT: %[[ARG_SRC:.*]] = vector.shape_cast %[[SRC]] : vector<8x4xf32> to vector<32xf32>
+  // DEFAULT: %[[ARG_DEST:.*]] = vector.shape_cast %[[DEST]] : vector<2x8x4xf32> to vector<64xf32>
+  // DEFAULT: %[[SHUFFLE:.*]] = vector.shuffle %[[ARG_DEST]], %[[ARG_SRC]]
+  // DEFAULT-SAME: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
+  // DEFAULT-SAME: 88, 89, 90, 91, 92, 93, 94, 95, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+  // DEFAULT-SAME: 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] : vector<64xf32>, vector<32xf32>
+  // DEFAULT: %[[RES:.*]] = vector.shape_cast %[[SHUFFLE]] : vector<64xf32> to vector<2x8x4xf32>
+  // DEFAULT: return %[[RES]] : vector<2x8x4xf32>
+
+  // BW-128: %[[ARG_SRC:.*]] = vector.shape_cast %[[SRC]] : vector<8x4xf32> to vector<32xf32>
+  // BW-128: %[[ARG_DEST:.*]] = vector.shape_cast %[[DEST]] : vector<2x8x4xf32> to vector<64xf32>
+  // BW-128: %[[SHUFFLE:.*]] = vector.shuffle %[[ARG_DEST]], %[[ARG_SRC]]
+  // BW-128-SAME: [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
+  // BW-128-SAME: 88, 89, 90, 91, 92, 93, 94, 95, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+  // BW-128-SAME: 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] : vector<64xf32>, vector<32xf32>
+  // BW-128: %[[RES:.*]] = vector.shape_cast %[[SHUFFLE]] : vector<64xf32> to vector<2x8x4xf32>
+  // BW-128: return %[[RES]] : vector<2x8x4xf32>
+
+  // BW-0: %[[RES:.*]] = vector.insert %[[SRC]], %[[DEST]] [0] : vector<8x4xf32> into vector<2x8x4xf32>
+  // BW-0: return %[[RES]] : vector<2x8x4xf32>
+
+  %0 = vector.insert %arg1, %arg0[0]: vector<8x4xf32> into vector<2x8x4xf32>
+  return %0 : vector<2x8x4xf32>
+}
diff --git a/mlir/test/Examples/NVGPU/tools/nvdsl.py b/mlir/test/Examples/NVGPU/tools/nvdsl.py
index 600cae5b47eee..90dbb2355e1c8 100644
--- a/mlir/test/Examples/NVGPU/tools/nvdsl.py
+++ b/mlir/test/Examples/NVGPU/tools/nvdsl.py
@@ -431,7 +431,7 @@ def __str__(self):
                 # saveIR(module)
 
                 # Verify the module
-                # module.operation.verify()
+                module.operation.verify()
 
                 # Compile and JIT MLIR module
                 options = f"cubin-chip=sm_90a cubin-features=+ptx80 opt-level=3"
diff --git a/mlir/test/Examples/transform/ChH/full.mlir b/mlir/test/Examples/transform/ChH/full.mlir
index f8d910370bc27..259475ebdbf49 100644
--- a/mlir/test/Examples/transform/ChH/full.mlir
+++ b/mlir/test/Examples/transform/ChH/full.mlir
@@ -380,27 +380,29 @@ module attributes { transform.with_named_sequence } {
 // immediately adjacent fma on vector<64xf32>.
 
 // CHECK:      %[[R0:.+]] = llvm.mlir.undef : !llvm.array<5 x vector<64xf32>>
-// CHECK-NEXT: %[[LINE0:.+]] = llvm.extractvalue %[[V:.+]][0] : !llvm.array<5 x vector<64xf32>>
+
+// CHECK:      %[[V:.+]] = llvm.load %{{.*}} : !llvm.ptr -> !llvm.array<5 x vector<64xf32>>
+// CHECK-NEXT: %[[LINE0:.+]] = llvm.extractvalue %[[V]][0] : !llvm.array<5 x vector<64xf32>>
 // CHECK-NEXT: %[[FMA0:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE0]])
 // CHECK-SAME: -> vector<64xf32>
 // CHECK-NEXT: %[[R1:.+]] = llvm.insertvalue %[[FMA0]], %[[R0]][0]
 
-// CHECK-NEXT: %[[LINE1:.+]] = llvm.extractvalue %[[V:.+]][1] : !llvm.array<5 x vector<64xf32>>
+// CHECK-NEXT: %[[LINE1:.+]] = llvm.extractvalue %[[V]][1] : !llvm.array<5 x vector<64xf32>>
 // CHECK-NEXT: %[[FMA1:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE1]])
 // CHECK-SAME: -> vector<64xf32>
 // CHECK-NEXT: %[[R2:.+]] = llvm.insertvalue %[[FMA1]], %[[R1]][1]
 
-// CHECK-NEXT: %[[LINE2:.+]] = llvm.extractvalue %[[V:.+]][2] : !llvm.array<5 x vector<64xf32>>
+// CHECK-NEXT: %[[LINE2:.+]] = llvm.extractvalue %[[V]][2] : !llvm.array<5 x vector<64xf32>>
 // CHECK-NEXT: %[[FMA2:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE2]])
 // CHECK-SAME: -> vector<64xf32>
 // CHECK-NEXT: %[[R3:.+]] = llvm.insertvalue %[[FMA2]], %[[R2]][2]
 
-// CHECK-NEXT: %[[LINE3:.+]] = llvm.extractvalue %[[V:.+]][3] : !llvm.array<5 x vector<64xf32>>
+// CHECK-NEXT: %[[LINE3:.+]] = llvm.extractvalue %[[V]][3] : !llvm.array<5 x vector<64xf32>>
 // CHECK-NEXT: %[[FMA3:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE3]])
 // CHECK-SAME: -> vector<64xf32>
 // CHECK-NEXT: %[[R4:.+]] = llvm.insertvalue %[[FMA3]], %[[R3]][3]
 
-// CHECK-NEXT: %[[LINE4:.+]] = llvm.extractvalue %[[V:.+]][4] : !llvm.array<5 x vector<64xf32>>
+// CHECK-NEXT: %[[LINE4:.+]] = llvm.extractvalue %[[V]][4] : !llvm.array<5 x vector<64xf32>>
 // CHECK-NEXT: %[[FMA4:.+]] = llvm.intr.fma(%{{.*}}, %{{.*}}, %[[LINE4]])
 // CHECK-SAME: -> vector<64xf32>
 // CHECK-NEXT: %[[R5:.+]] = llvm.insertvalue %[[FMA4]], %[[R4]][4]
diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
index b0e414d157268..5d27c3e290d50 100644
--- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
+++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN:   -func-bufferize -tensor-bufferize -arith-bufferize --canonicalize \
+// RUN:   -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \
 // RUN:   -convert-scf-to-cf --convert-complex-to-standard \
 // RUN:   -finalize-memref-to-llvm -convert-math-to-llvm -convert-math-to-libm \
 // RUN:   -convert-vector-to-llvm -convert-complex-to-llvm \
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir
index 43e423d4c3e8e..734e09b7ed103 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-collapse-tensor.mlir
@@ -1,10 +1,10 @@
-// RUN: mlir-opt %s -linalg-bufferize \
-// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \
+// RUN: mlir-opt %s \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \
 // RUN: -convert-scf-to-cf -expand-strided-metadata -lower-affine -convert-cf-to-llvm -convert-arith-to-llvm \
 // RUN: -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
-// RUN:   -shared-libs=%mlir_runner_utils \
+// RUN:   -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \
 // RUN: | FileCheck %s
 
 
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir
index 84dad567ced3f..a323b0d9f876c 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-elementwise.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s -convert-elementwise-to-linalg \
-// RUN: -arith-bufferize -linalg-bufferize -tensor-bufferize -func-bufferize \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -canonicalize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops \
 // RUN: -convert-scf-to-cf -convert-arith-to-llvm -convert-cf-to-llvm --finalize-memref-to-llvm \
 // RUN: -convert-func-to-llvm -reconcile-unrealized-casts | \
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir
index db882f7a54d39..45283e173c9f0 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-expand-tensor.mlir
@@ -1,10 +1,10 @@
-// RUN: mlir-opt %s -linalg-bufferize \
-// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \
+// RUN: mlir-opt %s \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \
 // RUN: -convert-scf-to-cf -expand-strided-metadata -lower-affine -convert-cf-to-llvm -convert-arith-to-llvm \
 // RUN: -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
-// RUN:   -shared-libs=%mlir_runner_utils \
+// RUN:   -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \
 // RUN: | FileCheck %s
 
 
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
index 54a2bbf8d4680..23a07464bb5be 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
@@ -1,6 +1,5 @@
 // RUN: mlir-opt %s -test-linalg-transform-patterns=test-linalg-to-vector-patterns \
-// RUN: -empty-tensor-to-alloc-tensor -linalg-bufferize -arith-bufferize \
-// RUN: -bufferization-bufferize -tensor-bufferize -func-bufferize \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \
 // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata \
 // RUN: -lower-affine -convert-arith-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir
index 98fce6c020c03..01a0ba26fd7cd 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert-multiple-uses.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -linalg-bufferize \
-// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \
+// RUN: mlir-opt %s \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \
 // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata  \
 // RUN: -lower-affine -convert-arith-to-llvm --finalize-memref-to-llvm \
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir
index cf7d0c762ea36..73d4aff73fb7a 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-subtensor-insert.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -linalg-bufferize \
-// RUN: -arith-bufferize -tensor-bufferize -func-bufferize \
+// RUN: mlir-opt %s \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref \
 // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata \
 // RUN: -lower-affine -convert-arith-to-llvm --finalize-memref-to-llvm \
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir
index 38b49cd444df3..ff9ddedf91e17 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-e2e.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt %s -arith-bufferize -linalg-bufferize \
-// RUN: -tensor-bufferize -func-bufferize -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops \
+// RUN: mlir-opt %s \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops \
 // RUN: -convert-arith-to-llvm -convert-scf-to-cf -convert-cf-to-llvm --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
index 41296cdfcb2d5..698191577efe3 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-tensor-matmul.mlir
@@ -1,14 +1,14 @@
 // UNSUPPORTED: asan
-// RUN: mlir-opt %s -test-transform-dialect-erase-schedule -linalg-bufferize -arith-bufferize \
-// RUN: -tensor-bufferize -func-bufferize -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops -convert-scf-to-cf \
+// RUN: mlir-opt %s -test-transform-dialect-erase-schedule \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN: -finalizing-bufferize -buffer-deallocation-pipeline -convert-bufferization-to-memref -convert-linalg-to-loops -convert-scf-to-cf \
 // RUN: -expand-strided-metadata -lower-affine -convert-arith-to-llvm -convert-scf-to-cf --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils \
 // RUN: | FileCheck %s
 
-// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule -linalg-bufferize \
-// RUN: -scf-bufferize -arith-bufferize -tensor-bufferize \
-// RUN: -func-bufferize \
+// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -finalizing-bufferize -convert-linalg-to-loops -convert-scf-to-cf -convert-scf-to-cf \
 // RUN:  -expand-strided-metadata -lower-affine -convert-arith-to-llvm -convert-scf-to-cf --finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
diff --git a/mlir/test/Integration/Dialect/Memref/print-memref.mlir b/mlir/test/Integration/Dialect/Memref/print-memref.mlir
index b83f3919efd83..f59e220d7461e 100644
--- a/mlir/test/Integration/Dialect/Memref/print-memref.mlir
+++ b/mlir/test/Integration/Dialect/Memref/print-memref.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN:   -func-bufferize -arith-bufferize --canonicalize \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \
 // RUN:   -finalize-memref-to-llvm\
 // RUN:   -convert-func-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner \
diff --git a/mlir/test/Integration/Dialect/Memref/verify-memref.mlir b/mlir/test/Integration/Dialect/Memref/verify-memref.mlir
index b7e2a46688f47..431ae0a89d20c 100644
--- a/mlir/test/Integration/Dialect/Memref/verify-memref.mlir
+++ b/mlir/test/Integration/Dialect/Memref/verify-memref.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN:   -func-bufferize -arith-bufferize --canonicalize \
+// RUN: -func-bufferize -one-shot-bufferize="bufferize-function-boundaries" --canonicalize \
 // RUN:   -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm\
 // RUN:   -convert-func-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner \
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir
index faa129efa63a9..a7c5b91273423 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-mulf-full.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-cf \
-// RUN:  -arith-bufferize -convert-vector-to-llvm="enable-amx" \
+// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN: -convert-scf-to-cf -convert-vector-to-llvm="enable-amx" \
 // RUN:  -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-translate -mlir-to-llvmir | \
 // RUN: %lli --entry-function=entry --mattr="+amx-tile,+amx-int8,+amx-bf16" \
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir
index 3ed28fc68acb8..7b7ee54db8c34 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-full.mlir
@@ -1,5 +1,7 @@
-// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine -convert-scf-to-cf \
-// RUN:  -arith-bufferize -convert-vector-to-llvm="enable-amx" \
+// RUN: mlir-opt %s -convert-vector-to-scf -lower-affine \
+// RUN: -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN: -convert-scf-to-cf \
+// RUN:  -convert-vector-to-llvm="enable-amx" \
 // RUN:  -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
 // RUN: mlir-translate -mlir-to-llvmir | \
 // RUN: %lli --entry-function=entry --mattr="+amx-tile,+amx-int8,+amx-bf16" \
diff --git a/mlir/test/Target/Cpp/expressions.mlir b/mlir/test/Target/Cpp/expressions.mlir
index 2eda58902cb1d..aaddd5af874a9 100644
--- a/mlir/test/Target/Cpp/expressions.mlir
+++ b/mlir/test/Target/Cpp/expressions.mlir
@@ -65,15 +65,15 @@ func.func @do_not_inline(%arg0: i32, %arg1: i32, %arg2 : i32) -> i32 {
   return %e : i32
 }
 
-// CPP-DEFAULT:      float paranthesis_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
+// CPP-DEFAULT:      float parentheses_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
 // CPP-DEFAULT-NEXT:   return (float) ([[VAL_1]] + [[VAL_2]] * [[VAL_3]]);
 // CPP-DEFAULT-NEXT: }
 
-// CPP-DECLTOP:      float paranthesis_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
+// CPP-DECLTOP:      float parentheses_for_low_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
 // CPP-DECLTOP-NEXT:   return (float) ([[VAL_1]] + [[VAL_2]] * [[VAL_3]]);
 // CPP-DECLTOP-NEXT: }
 
-func.func @paranthesis_for_low_precedence(%arg0: i32, %arg1: i32, %arg2: i32) -> f32 {
+func.func @parentheses_for_low_precedence(%arg0: i32, %arg1: i32, %arg2: i32) -> f32 {
   %e = emitc.expression : f32 {
     %a = emitc.add %arg0, %arg1 : (i32, i32) -> i32
     %b = emitc.mul %a, %arg2 : (i32, i32) -> i32
@@ -83,6 +83,23 @@ func.func @paranthesis_for_low_precedence(%arg0: i32, %arg1: i32, %arg2: i32) ->
   return %e : f32
 }
 
+// CPP-DEFAULT:      int32_t parentheses_for_same_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
+// CPP-DEFAULT-NEXT:   return [[VAL_3]] / ([[VAL_1]] * [[VAL_2]]);
+// CPP-DEFAULT-NEXT: }
+
+// CPP-DECLTOP:      int32_t parentheses_for_same_precedence(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) {
+// CPP-DECLTOP-NEXT:   return [[VAL_3]] / ([[VAL_1]] * [[VAL_2]]);
+// CPP-DECLTOP-NEXT: }
+func.func @parentheses_for_same_precedence(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 {
+  %e = emitc.expression : i32 {
+      %0 = emitc.mul %arg0, %arg1 : (i32, i32) -> i32
+      %1 = emitc.div %arg2, %0 : (i32, i32) -> i32
+      emitc.yield %1 : i32
+    }
+
+  return %e : i32
+}
+
 // CPP-DEFAULT:      int32_t multiple_uses(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]], int32_t [[VAL_4:v[0-9]+]]) {
 // CPP-DEFAULT-NEXT:   bool [[VAL_5:v[0-9]+]] = bar([[VAL_1]] * [[VAL_2]], [[VAL_3]]) - [[VAL_4]] < [[VAL_2]];
 // CPP-DEFAULT-NEXT:   int32_t [[VAL_6:v[0-9]+]];
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 18324482153a5..9d7e0a7928ab8 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2750,7 +2750,7 @@ def TestGraphLoopOp : TEST_Op<"graph_loop",
 def InferIntRangeType : AnyTypeOf<[AnyInteger, Index]>;
 
 def TestWithBoundsOp : TEST_Op<"with_bounds",
-                          [DeclareOpInterfaceMethods<InferIntRangeInterface>,
+                          [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
                            NoMemoryEffect]> {
   let arguments = (ins APIntAttr:$umin,
                        APIntAttr:$umax,
@@ -2762,7 +2762,7 @@ def TestWithBoundsOp : TEST_Op<"with_bounds",
 }
 
 def TestWithBoundsRegionOp : TEST_Op<"with_bounds_region",
-                          [DeclareOpInterfaceMethods<InferIntRangeInterface>,
+                          [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
                            SingleBlock, NoTerminator]> {
   let arguments = (ins APIntAttr:$umin,
                        APIntAttr:$umax,
@@ -2774,7 +2774,7 @@ def TestWithBoundsRegionOp : TEST_Op<"with_bounds_region",
 }
 
 def TestIncrementOp : TEST_Op<"increment",
-                         [DeclareOpInterfaceMethods<InferIntRangeInterface>,
+                         [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
                          NoMemoryEffect, AllTypesMatch<["value", "result"]>]> {
   let arguments = (ins InferIntRangeType:$value);
   let results = (outs InferIntRangeType:$result);
@@ -2783,7 +2783,8 @@ def TestIncrementOp : TEST_Op<"increment",
 }
 
 def TestReflectBoundsOp : TEST_Op<"reflect_bounds",
-                         [DeclareOpInterfaceMethods<InferIntRangeInterface>, AllTypesMatch<["value", "result"]>]> {
+                         [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
+                          AllTypesMatch<["value", "result"]>]> {
   let arguments = (ins InferIntRangeType:$value,
                        OptionalAttr<APIntAttr>:$umin,
                        OptionalAttr<APIntAttr>:$umax,
diff --git a/mlir/test/python/dialects/scf.py b/mlir/test/python/dialects/scf.py
index ee8d09aa301d9..95a6de86b670d 100644
--- a/mlir/test/python/dialects/scf.py
+++ b/mlir/test/python/dialects/scf.py
@@ -176,6 +176,56 @@ def range_loop_7(lb, ub, step, memref_v):
             memref.store(add, memref_v, [i])
             scf.yield_([])
 
+    # CHECK:  func.func @loop_yield_1(%[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: memref<10xindex>) {
+    # CHECK:    %[[VAL_4:.*]] = arith.constant 0 : index
+    # CHECK:    %[[VAL_5:.*]] = arith.constant 0 : index
+    # CHECK:    %[[VAL_6:.*]] = arith.constant 0 : index
+    # CHECK:    %[[VAL_7:.*]] = arith.constant 100 : index
+    # CHECK:    %[[VAL_8:.*]] = arith.constant 1 : index
+    # CHECK:    %[[VAL_10:.*]] = scf.for %[[IV:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_8]] iter_args(%[[ITER:.*]] = %[[VAL_4]]) -> (index) {
+    # CHECK:      %[[VAL_9:.*]] = arith.addi %[[ITER]], %[[IV]] : index
+    # CHECK:      scf.yield %[[VAL_9]] : index
+    # CHECK:    }
+    # CHECK:    memref.store %[[VAL_10]], %[[VAL_3]]{{\[}}%[[VAL_5]]] : memref<10xindex>
+    # CHECK:    return
+    # CHECK:  }
+    @func.FuncOp.from_py_func(index_type, index_type, index_type, memref_t)
+    def loop_yield_1(lb, ub, step, memref_v):
+        sum = arith.ConstantOp.create_index(0)
+        c0 = arith.ConstantOp.create_index(0)
+        for i, loc_sum, sum in scf.for_(0, 100, 1, [sum]):
+            loc_sum = arith.addi(loc_sum, i)
+            scf.yield_([loc_sum])
+        memref.store(sum, memref_v, [c0])
+
+    # CHECK:  func.func @loop_yield_2(%[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: memref<10xindex>) {
+    # CHECK:    %[[c0:.*]] = arith.constant 0 : index
+    # CHECK:    %[[c2:.*]] = arith.constant 2 : index
+    # CHECK:    %[[REF1:.*]] = arith.constant 0 : index
+    # CHECK:    %[[REF2:.*]] = arith.constant 1 : index
+    # CHECK:    %[[VAL_6:.*]] = arith.constant 0 : index
+    # CHECK:    %[[VAL_7:.*]] = arith.constant 100 : index
+    # CHECK:    %[[VAL_8:.*]] = arith.constant 1 : index
+    # CHECK:    %[[RES:.*]] = scf.for %[[IV:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_8]] iter_args(%[[ITER1:.*]] = %[[c0]], %[[ITER2:.*]] = %[[c2]]) -> (index, index) {
+    # CHECK:      %[[VAL_9:.*]] = arith.addi %[[ITER1]], %[[IV]] : index
+    # CHECK:      %[[VAL_10:.*]] = arith.addi %[[ITER2]], %[[IV]] : index
+    # CHECK:      scf.yield %[[VAL_9]], %[[VAL_10]] : index, index
+    # CHECK:    }
+    # CHECK:    return
+    # CHECK:  }
+    @func.FuncOp.from_py_func(index_type, index_type, index_type, memref_t)
+    def loop_yield_2(lb, ub, step, memref_v):
+        sum1 = arith.ConstantOp.create_index(0)
+        sum2 = arith.ConstantOp.create_index(2)
+        c0 = arith.ConstantOp.create_index(0)
+        c1 = arith.ConstantOp.create_index(1)
+        for i, [loc_sum1, loc_sum2], [sum1, sum2] in scf.for_(0, 100, 1, [sum1, sum2]):
+            loc_sum1 = arith.addi(loc_sum1, i)
+            loc_sum2 = arith.addi(loc_sum2, i)
+            scf.yield_([loc_sum1, loc_sum2])
+        memref.store(sum1, memref_v, [c0])
+        memref.store(sum2, memref_v, [c1])
+
 
 @constructAndPrintInModule
 def testOpsAsArguments():
diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp
index f72007849e36e..13f08b142b876 100644
--- a/offload/src/PluginManager.cpp
+++ b/offload/src/PluginManager.cpp
@@ -155,11 +155,11 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
 
   // Initialize all the plugins that have associated images.
   for (auto &Plugin : Plugins) {
-    if (Plugin->is_initialized())
-      continue;
-
     // Extract the exectuable image and extra information if availible.
     for (int32_t i = 0; i < Desc->NumDeviceImages; ++i) {
+      if (Plugin->is_initialized())
+        continue;
+
       if (!Plugin->is_valid_binary(&Desc->DeviceImages[i],
                                    /*Initialized=*/false))
         continue;
diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 612d784be8a55..62c35c19e6b45 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -177,6 +177,7 @@ else()
   add_llvm_library(omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES} PARTIAL_SOURCES_INTENDED
     LINK_LIBS ${LIBOMP_CONFIGURED_LIBFLAGS} ${LIBOMP_DL_LIBS}
     LINK_COMPONENTS Support
+    BUILDTREE_ONLY
     )
   # libomp must be a C++ library such that it can link libLLVMSupport
   set(LIBOMP_LINKER_LANGUAGE CXX)
diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp
index de77e25db2d39..d7658077e83ae 100644
--- a/openmp/tools/archer/ompt-tsan.cpp
+++ b/openmp/tools/archer/ompt-tsan.cpp
@@ -19,6 +19,7 @@
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
+#include <dlfcn.h>
 #include <inttypes.h>
 #include <iostream>
 #include <list>
@@ -29,7 +30,6 @@
 #include <unistd.h>
 #include <unordered_map>
 #include <vector>
-#include <dlfcn.h>
 
 #include "omp-tools.h"
 
@@ -146,18 +146,28 @@ void __attribute__((weak)) __tsan_flush_memory() {}
 static ArcherFlags *archer_flags;
 
 #ifndef TsanHappensBefore
+
+template <typename... Args> static void __ompt_tsan_func(Args...) {}
+
+#define DECLARE_TSAN_FUNCTION(name, ...)                                       \
+  static void (*name)(__VA_ARGS__) = __ompt_tsan_func<__VA_ARGS__>;
+
 // Thread Sanitizer is a tool that finds races in code.
 // See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
 // tsan detects these exact functions by name.
 extern "C" {
-static void (*AnnotateHappensAfter)(const char *, int, const volatile void *);
-static void (*AnnotateHappensBefore)(const char *, int, const volatile void *);
-static void (*AnnotateIgnoreWritesBegin)(const char *, int);
-static void (*AnnotateIgnoreWritesEnd)(const char *, int);
-static void (*AnnotateNewMemory)(const char *, int, const volatile void *,
-                                 size_t);
-static void (*__tsan_func_entry)(const void *);
-static void (*__tsan_func_exit)(void);
+DECLARE_TSAN_FUNCTION(AnnotateHappensAfter, const char *, int,
+                      const volatile void *)
+DECLARE_TSAN_FUNCTION(AnnotateHappensBefore, const char *, int,
+                      const volatile void *)
+DECLARE_TSAN_FUNCTION(AnnotateIgnoreWritesBegin, const char *, int)
+DECLARE_TSAN_FUNCTION(AnnotateIgnoreWritesEnd, const char *, int)
+DECLARE_TSAN_FUNCTION(AnnotateNewMemory, const char *, int,
+                      const volatile void *, size_t)
+DECLARE_TSAN_FUNCTION(__tsan_func_entry, const void *)
+DECLARE_TSAN_FUNCTION(__tsan_func_exit)
+
+// RunningOnValgrind is used to detect absence of TSan and must intentionally be a nullptr.
 static int (*RunningOnValgrind)(void);
 }
 
@@ -1142,7 +1152,10 @@ static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id,
 
 #define findTsanFunction(f, fSig)                                              \
   do {                                                                         \
-    if (NULL == (f = fSig dlsym(RTLD_DEFAULT, #f)))                            \
+    void *fp = dlsym(RTLD_DEFAULT, #f);                                        \
+    if (fp)                                                                    \
+      f = fSig fp;                                                             \
+    else                                                                       \
       printf("Unable to find TSan function " #f ".\n");                        \
   } while (0)
 
diff --git a/polly/include/polly/ScheduleTreeTransform.h b/polly/include/polly/ScheduleTreeTransform.h
index ee504c4e5f524..6bd5a3abf9ea2 100644
--- a/polly/include/polly/ScheduleTreeTransform.h
+++ b/polly/include/polly/ScheduleTreeTransform.h
@@ -47,9 +47,9 @@ struct ScheduleTreeVisitor {
       return getDerived().visitSequence(Node.as<isl::schedule_node_sequence>(),
                                         std::forward<Args>(args)...);
     case isl_schedule_node_set:
+      assert(isl_schedule_node_n_children(Node.get()) >= 2);
       return getDerived().visitSet(Node.as<isl::schedule_node_set>(),
                                    std::forward<Args>(args)...);
-      assert(isl_schedule_node_n_children(Node.get()) >= 2);
     case isl_schedule_node_leaf:
       assert(isl_schedule_node_n_children(Node.get()) == 0);
       return getDerived().visitLeaf(Node.as<isl::schedule_node_leaf>(),
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 446499cf15d7b..70ec3a48a5e2e 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -127,6 +127,11 @@ libc_support_library(
     hdrs = ["hdr/time_macros.h"],
 )
 
+libc_support_library(
+    name = "hdr_float_macros",
+    hdrs = ["hdr/float_macros.h"],
+)
+
 ############################ Type Proxy Header Files ###########################
 
 libc_support_library(
@@ -189,7 +194,7 @@ libc_support_library(
         ":__support_macros_properties_compiler",
         ":__support_macros_properties_cpu_features",
         ":__support_macros_properties_os",
-        ":llvm_libc_macros_float_macros",
+        ":hdr_float_macros",
         ":llvm_libc_types_float128",
     ],
 )
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index d1a2c6f11d98a..a67f20533ae22 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -2232,7 +2232,7 @@ llvm_target_lib_list = [lib for lib in [
             ("-gen-callingconv", "lib/Target/X86/X86GenCallingConv.inc"),
             ("-gen-subtarget", "lib/Target/X86/X86GenSubtargetInfo.inc"),
             ("-gen-x86-fold-tables -asmwriternum=1", "lib/Target/X86/X86GenFoldTables.inc"),
-            ("-gen-x86-compress-evex-tables", "lib/Target/X86/X86GenCompressEVEXTables.inc"),
+            ("-gen-x86-instr-mapping", "lib/Target/X86/X86GenInstrMapping.inc"),
             ("-gen-exegesis", "lib/Target/X86/X86GenExegesis.inc"),
             ("-gen-x86-mnemonic-tables -asmwriternum=1", "lib/Target/X86/X86GenMnemonicTables.inc"),
         ],
diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake
index 6605ea60df99e..629977cc11d68 100644
--- a/utils/bazel/llvm_configs/llvm-config.h.cmake
+++ b/utils/bazel/llvm_configs/llvm-config.h.cmake
@@ -198,4 +198,7 @@
 /* Define if plugins enabled */
 #cmakedefine LLVM_ENABLE_PLUGINS
 
+/* Define if logf128 is available */
+#cmakedefine LLVM_HAS_LOGF128
+
 #endif